
Commit c666bcb

Author: Iswarya Alex

Merge remote-tracking branch 'origin/rai-npu-support' into iswarya/npu-support

2 parents faee4d4 + 426daf2

File tree

6 files changed: +370 −1 lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -91,6 +91,7 @@ endif()
 option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
 option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
 option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)
+option(WHISPER_VITISAI "whisper: support for AMD Vitis AI" OFF)

 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)

README.md

Lines changed: 41 additions & 0 deletions
@@ -21,6 +21,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
 - [Vulkan support](#vulkan-gpu-support)
 - Support for CPU-only inference
 - [Efficient GPU support for NVIDIA](#nvidia-gpu-support)
+- [AMD Ryzen AI NPU Support](#amd-ryzen-ai-support-for-npu)
 - [OpenVINO Support](#openvino-support)
 - [Ascend NPU Support](#ascend-npu-support)
 - [Moore Threads GPU Support](#moore-threads-gpu-support)
@@ -312,6 +313,46 @@ This can result in significant speedup in encoder performance. Here are the inst

For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggml-org/whisper.cpp/pull/1037).

## AMD Ryzen™ AI support for NPU

On AMD Ryzen™ AI 300 Series processors, which include a dedicated NPU for acceleration, you can now run Whisper models with the encoder fully offloaded to the NPU. This brings a significant speedup compared to CPU-only inference.

> **Note:**
> **Ryzen™ AI NPU acceleration is currently supported on Windows only.** Linux support is planned for upcoming releases.
> For the latest updates on Ryzen AI, check out [the official documentation](https://ryzenai.docs.amd.com/en/latest/).

### Setup environment (Windows only)

- **Driver:** Make sure you have NPU driver version **.280 or newer** installed. [Download the latest driver from here](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=NPU_RAI1.5_280_WHQL.zip)
- **Runtime libraries:** Download and install the necessary [runtime dependencies from here](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=NPU_RAI1.5_280_WHQL.zip).
- **Environment:** Extract the runtime package and set up the environment:

```powershell
tar xvf flexmlrt1.7rc3.zip
flexmlrt\setup.bat
```

Your environment is now ready.

### Build whisper.cpp for Ryzen™ AI support

```bash
cmake -B build -DWHISPER_VITISAI=1
cmake --build build -j --config Release
```

### Download NPU-optimized models

- All NPU-supported Whisper models and their compiled `.rai` cache files are available in this collection:
  https://huggingface.co/collections/amd/ryzen-ai-16-whisper-npu-optimized-onnx-models
- Download the `.rai` file matching your desired model and place it in your `models/` directory alongside its corresponding `ggml-<...>.bin` file.

> **Note:** The `.rai` models from Hugging Face are pre-optimized for Ryzen™ AI NPUs, delivering acceleration benefits from the very first run (aside from any initial CPU-side caching overhead).

Run the examples as usual:

```bash
./build/bin/whisper-cli -m models/ggml-base.en.bin -f samples/jfk.wav
```

## NVIDIA GPU support

With NVIDIA cards the processing of the models is done efficiently on the GPU via cuBLAS and custom CUDA kernels.

src/CMakeLists.txt

Lines changed: 32 additions & 0 deletions
@@ -48,6 +48,10 @@ if (WHISPER_OPENVINO)
     find_package(OpenVINO REQUIRED COMPONENTS Runtime)
 endif()

+if (WHISPER_VITISAI)
+    find_package(FlexmlRT REQUIRED)
+endif()
+
 #
 # libraries
 #
@@ -101,6 +105,30 @@ if (WHISPER_OPENVINO)
     set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
 endif()

+if (WHISPER_VITISAI)
+    set(TARGET whisper.vitisai)
+
+    add_library(${TARGET} OBJECT
+        vitisai/whisper-vitisai-encoder.h
+        vitisai/whisper-vitisai-encoder.cpp
+        )
+
+    target_include_directories(${TARGET} PUBLIC
+        .
+        )
+
+    set_property(TARGET ${TARGET} PROPERTY POSITION_INDEPENDENT_CODE ON)
+    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_VITISAI)
+
+    # Add C++17 standard for MSVC
+    if (MSVC)
+        target_compile_options(${TARGET} PRIVATE /std:c++17)
+    endif()
+
+    target_link_libraries(${TARGET} PRIVATE ggml flexmlrt::flexmlrt)
+    set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
+endif()
+
 # whisper

 add_library(whisper
@@ -137,6 +165,10 @@ if (WHISPER_OPENVINO)
     target_link_libraries(whisper PRIVATE whisper.openvino)
 endif()

+if (WHISPER_VITISAI)
+    target_link_libraries(whisper PRIVATE whisper.vitisai)
+endif()
+
 if (WHISPER_MKL)
     target_link_libraries(whisper PRIVATE MKL::MKL)
 endif()
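
The object library above is only compiled and linked when `WHISPER_VITISAI` is on, and `-DWHISPER_USE_VITISAI` is the define the rest of the code base can key off. The whisper.cpp-side integration is not part of the excerpt shown here; the following is a minimal sketch of how that define could gate an encoder call (the wrapper `encode_with_vitisai` is hypothetical, not a function from this commit):

```cpp
// Hypothetical sketch: gating the Vitis AI encoder behind the
// WHISPER_USE_VITISAI define set in src/CMakeLists.txt.
#ifdef WHISPER_USE_VITISAI
#include "vitisai/whisper-vitisai-encoder.h"
#endif

struct ggml_tensor;
struct whisper_vitisai_context; // full API lives in the header above

static bool encode_with_vitisai(struct whisper_vitisai_context * vctx,
                                struct ggml_tensor * mel,
                                struct ggml_tensor * out) {
#ifdef WHISPER_USE_VITISAI
    // whisper_vitisai_encode() returns 1 on success, 0 on failure
    return vctx && whisper_vitisai_encode(vctx, mel, out) == 1;
#else
    (void) vctx; (void) mel; (void) out;
    return false; // non-NPU build: caller falls back to the ggml encoder
#endif
}
```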
src/vitisai/whisper-vitisai-encoder.cpp

Lines changed: 204 additions & 0 deletions

@@ -0,0 +1,204 @@
// Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved.
#include "vitisai/whisper-vitisai-encoder.h"
#include "FlexMLClient.h"
#include "ggml.h"
#include "ggml-backend.h"

#include <cstdio>
#include <cstdlib>
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif
#include <cstring>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

struct whisper_vitisai_context {
    std::string model_path;
    std::shared_ptr<flexmlrt::client::Model> runner;
    uint8_t * fbs_buffer      = nullptr; // mapped .rai file contents, if any
    size_t    fbs_buffer_size = 0;
};

// Map a rai file into memory: mmap on Linux, MapViewOfFile on Windows
bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size) {
#ifdef _WIN32
    // Open the file
    HANDLE hFile = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (hFile == INVALID_HANDLE_VALUE) {
        std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Get the file size
    LARGE_INTEGER fileSize;
    if (!GetFileSizeEx(hFile, &fileSize)) {
        CloseHandle(hFile);
        std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Create a file mapping object (split the 64-bit size into high/low DWORDs)
    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY,
                                         (DWORD)(fileSize.QuadPart >> 32), (DWORD)(fileSize.QuadPart & 0xffffffff), NULL);
    if (hMapping == NULL) {
        CloseHandle(hFile);
        std::fprintf(stderr, "%s: %d: Failed to create file mapping for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Map the file
    *buffer = (uint8_t *) MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, fileSize.QuadPart);
    if (*buffer == NULL) {
        CloseHandle(hMapping);
        CloseHandle(hFile);
        std::fprintf(stderr, "%s: %d: Failed to map rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // The view keeps the mapping alive, so both handles can be closed here
    CloseHandle(hMapping);
    CloseHandle(hFile);

    *size = fileSize.QuadPart;
    return true;
#else
    // Open the file
    FILE * fd = fopen(path, "rb");
    if (!fd) {
        std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Get the file size
    struct stat st;
    if (fstat(fileno(fd), &st) == -1) {
        fclose(fd);
        std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Mmap the file
    *buffer = (uint8_t *) mmap(nullptr, st.st_size, PROT_READ, MAP_SHARED, fileno(fd), 0);
    if (*buffer == MAP_FAILED) {
        fclose(fd);
        std::fprintf(stderr, "%s: %d: Failed to mmap rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // The mapping stays valid after the underlying file is closed
    fclose(fd);

    *size = st.st_size;
    return true;
#endif // _WIN32
}

void unmap_rai_file(uint8_t * buffer, size_t size) {
#ifdef _WIN32
    (void) size;
    UnmapViewOfFile(buffer);
#else
    munmap(buffer, size);
#endif // _WIN32
}
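
A short usage sketch for the two helpers above; the `.rai` path is a placeholder, not a file name from this commit:

```cpp
// Usage sketch for map_rai_file()/unmap_rai_file(); the path is a placeholder.
uint8_t * buf  = nullptr;
size_t    size = 0;
if (map_rai_file("models/whisper-base.rai", &buf, &size)) {
    // buf/size now describe the read-only mapped file contents,
    // e.g. for handing to the runtime as fbs_buffer/fbs_buffer_size
    unmap_rai_file(buf, size);
}
```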
struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model) {
    if (!path_model) {
        std::fprintf(stderr, "%s: path_model is null\n", __func__);
        return nullptr;
    }

    auto * ctx = new whisper_vitisai_context;
    ctx->model_path = path_model;

    // Override the model path with the environment variable if it is set
    if (const char * env_model_path = std::getenv("OVERRIDE_VITISAI_MODEL_PATH")) {
        if (env_model_path[0] != '\0') {
            ctx->model_path = env_model_path;
        }
    }

    // Step 1: Set up the model
    flexmlrt::client::Options options;
    options.modelPath   = ctx->model_path;
    options.deviceName  = "stx";
    options.debug       = false;
    options.executeMode = 2;
    options.extOptions["ai_analyzer_profiling"] = true; // Enable AIA profiling
    options.extOptions["enable_preemption"]     = true;

    // If model_path is a rai file, map it (works on both Linux and Windows)
    // and pass the buffer to the runtime via fbs_buffer/fbs_buffer_size
    if (ctx->model_path.find(".rai") != std::string::npos) {
        if (map_rai_file(ctx->model_path.c_str(), &ctx->fbs_buffer, &ctx->fbs_buffer_size)) {
            options.extOptions["fbs_buffer"]      = ctx->fbs_buffer;
            options.extOptions["fbs_buffer_size"] = ctx->fbs_buffer_size;
            options.subgraphName                  = "vaiml_par_0";
            options.extOptions["cache_dir"]       = std::string(".");
        } else {
            std::fprintf(stderr, "%s: Failed to mmap rai file '%s'\n", __func__, ctx->model_path.c_str());
            delete ctx;
            return nullptr;
        }
    }

    try {
        ctx->runner = std::make_shared<flexmlrt::client::Model>(options);

        if (!ctx->runner->good()) {
            throw std::runtime_error("Runner creation ran into an error");
        }
    } catch (const std::exception & e) {
        std::fprintf(stderr, "%s: Exception during Vitis AI runner creation: %s\n", __func__, e.what());
        whisper_vitisai_free(ctx); // also unmaps the rai buffer if it was mapped
        return nullptr;
    }
    return ctx;
}

void whisper_vitisai_free(struct whisper_vitisai_context * ctx) {
    if (!ctx) {
        return;
    }

    std::fprintf(stderr, "%s: releasing Vitis AI encoder context for model '%s'\n", __func__, ctx->model_path.c_str());
    if (ctx->fbs_buffer) {
        unmap_rai_file(ctx->fbs_buffer, ctx->fbs_buffer_size);
    }
    delete ctx;
}
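
Taken together, init and free bracket the encoder's lifetime. A minimal caller sketch (the `.rai` path is a placeholder):

```cpp
// Lifecycle sketch: one context per model, freed when inference is done.
struct whisper_vitisai_context * vctx = whisper_vitisai_init("models/whisper-base.rai");
if (vctx) {
    // ... call whisper_vitisai_encode() once per encoder pass ...
    whisper_vitisai_free(vctx);
}
```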
int whisper_vitisai_encode(struct whisper_vitisai_context * ctx, struct ggml_tensor * mel, struct ggml_tensor * out) {
    if (!ctx || !mel || !out) {
        std::fprintf(stderr, "%s: ctx/mel/out must not be null\n", __func__);
        return 0;
    }

    if (ggml_n_dims(mel) != 2) {
        std::fprintf(stderr, "%s: mel tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(mel));
        return 0;
    }

    if (ggml_n_dims(out) != 2) {
        std::fprintf(stderr, "%s: out tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(out));
        return 0;
    }

    // Set up input and output tensors for the Vitis AI model
    std::vector<flexmlrt::client::ErtTensorType> input_tensors, output_tensors;
    auto model = ctx->runner;

    // Get tensors as CPU tensors (hwTensor = false)
    input_tensors  = model->getIOTensors("input", false);
    output_tensors = model->getIOTensors("output", false);

    // TODO: add assert checks for tensor numbers and shapes

    // Point the runtime tensors directly at the ggml tensor data (zero-copy)
    input_tensors[0].data  = mel->data;
    output_tensors[0].data = out->data;

    try {
        model->forward(input_tensors, output_tensors);
        std::fprintf(stdout, "%s: Vitis AI model inference completed.\n", __func__);
    } catch (const std::exception & e) {
        std::fprintf(stderr, "%s: Exception during model inference: %s\n", __func__, e.what());
        return 0;
    }

    return 1;
}
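
A sketch of how a caller could wire ggml tensors into `whisper_vitisai_encode()`. The shapes are assumptions for a base-size Whisper model (80 mel bins by 3000 frames in, 512 by 1500 encoder states out), not values taken from this commit:

```cpp
#include "ggml.h"
#include "vitisai/whisper-vitisai-encoder.h"

// Sketch: driving the encoder with ggml-backed buffers. Shapes below are
// assumptions for a base-size model, not checked against the .rai graph.
int run_encoder_once(struct whisper_vitisai_context * vctx) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 32u*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * gctx = ggml_init(params);

    struct ggml_tensor * mel = ggml_new_tensor_2d(gctx, GGML_TYPE_F32, 3000, 80);  // [frames, mel bins]
    struct ggml_tensor * out = ggml_new_tensor_2d(gctx, GGML_TYPE_F32, 512, 1500); // [n_state, n_ctx]

    // ... fill mel->data with the log-mel spectrogram ...

    const int ok = whisper_vitisai_encode(vctx, mel, out); // 1 on success
    ggml_free(gctx);
    return ok;
}
```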
src/vitisai/whisper-vitisai-encoder.h

Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
// Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

struct whisper_vitisai_context;

struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model);
void whisper_vitisai_free(struct whisper_vitisai_context * ctx);

// Map a rai file: mmap on Linux, MapViewOfFile on Windows
bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size);
// Unmap a rai file: munmap on Linux, UnmapViewOfFile on Windows
void unmap_rai_file(uint8_t * buffer, size_t size);

struct ggml_tensor;

// Run the Whisper encoder on the NPU; returns 1 on success, 0 on failure
int whisper_vitisai_encode(
    struct whisper_vitisai_context * ctx,
    struct ggml_tensor * mel,
    struct ggml_tensor * out);

#ifdef __cplusplus
}
#endif
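
One practical consequence of the `OVERRIDE_VITISAI_MODEL_PATH` handling in `whisper_vitisai_init()` is that the NPU model can be swapped without changing the path passed by the caller. A sketch (paths are placeholders):

```cpp
#include <cstdlib>
#include "vitisai/whisper-vitisai-encoder.h"

// Sketch: OVERRIDE_VITISAI_MODEL_PATH redirects whisper_vitisai_init()
// regardless of the path argument. Paths below are placeholders.
int main() {
#ifdef _WIN32
    _putenv_s("OVERRIDE_VITISAI_MODEL_PATH", "D:\\models\\whisper-base.rai");
#else
    setenv("OVERRIDE_VITISAI_MODEL_PATH", "/opt/models/whisper-base.rai", 1);
#endif
    struct whisper_vitisai_context * vctx = whisper_vitisai_init("models/whisper-base.rai");
    if (vctx) {
        whisper_vitisai_free(vctx);
    }
    return 0;
}
```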
