
Commit c666bcb

Author: Iswarya Alex

Merge remote-tracking branch 'origin/rai-npu-support' into iswarya/npu-support

2 parents faee4d4 + 426daf2

File tree

6 files changed: +370 −1 lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -91,6 +91,7 @@ endif()
 option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
 option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
 option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)
+option(WHISPER_VITISAI "whisper: support for AMD Vitis AI" OFF)

 # Required for relocatable CMake package
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)

README.md

Lines changed: 41 additions & 0 deletions
@@ -21,6 +21,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
 - [Vulkan support](#vulkan-gpu-support)
 - Support for CPU-only inference
 - [Efficient GPU support for NVIDIA](#nvidia-gpu-support)
+- [AMD Ryzen AI NPU Support](#amd-ryzen-ai-support-for-npu)
 - [OpenVINO Support](#openvino-support)
 - [Ascend NPU Support](#ascend-npu-support)
 - [Moore Threads GPU Support](#moore-threads-gpu-support)
@@ -312,6 +313,46 @@ This can result in significant speedup in encoder performance. Here are the inst

For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggml-org/whisper.cpp/pull/1037).

## AMD Ryzen™ AI support for NPU

On AMD Ryzen™ AI 300 Series processors, which include a dedicated NPU for acceleration, you can now run Whisper models with the encoder fully offloaded to the NPU. This brings a significant speedup compared to CPU-only inference.

> **Note:**
> **Ryzen™ AI NPU acceleration is currently supported on Windows only.** Linux support is planned for upcoming releases.
> For the latest updates on Ryzen AI, check out [the official documentation](https://ryzenai.docs.amd.com/en/latest/).

### Setup environment (Windows only)

- **Driver:** Make sure you have NPU driver version **.280 or newer** installed. [Download the latest driver from here](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=NPU_RAI1.5_280_WHQL.zip)
- **Runtime libraries:** Download and install the necessary [runtime dependencies from here](https://account.amd.com/en/forms/downloads/ryzenai-eula-public-xef.html?filename=NPU_RAI1.5_280_WHQL.zip).
- **Environment:** Extract the runtime package and set up the environment:

```powershell
tar xvf flexmlrt1.7rc3.zip
flexmlrt\setup.bat
```

Your environment is now ready.

### Build whisper.cpp for Ryzen™ AI support

```bash
cmake -B build -DWHISPER_VITISAI=1
cmake --build build -j --config Release
```

### Download NPU-optimized models

- All NPU-supported Whisper models and their compiled `.rai` cache files are available in this collection:
  https://huggingface.co/collections/amd/ryzen-ai-16-whisper-npu-optimized-onnx-models
- Download the `.rai` file matching your desired model and place it in your `models/` directory alongside its corresponding `ggml-<...>.bin` file.

> **Note:** The `.rai` models from Hugging Face are pre-optimized for Ryzen™ AI NPUs, delivering acceleration benefits from the very first run (aside from any initial CPU-side caching overhead).

Run the examples as usual:

```bash
./build/bin/whisper-cli -m models/ggml-base.en.bin -f samples/jfk.wav
```

## NVIDIA GPU support

With NVIDIA cards the processing of the models is done efficiently on the GPU via cuBLAS and custom CUDA kernels.

src/CMakeLists.txt

Lines changed: 32 additions & 0 deletions
@@ -48,6 +48,10 @@ if (WHISPER_OPENVINO)
     find_package(OpenVINO REQUIRED COMPONENTS Runtime)
 endif()

+if (WHISPER_VITISAI)
+    find_package(FlexmlRT REQUIRED)
+endif()
+
 #
 # libraries
 #
@@ -101,6 +105,30 @@ if (WHISPER_OPENVINO)
     set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
 endif()

+if (WHISPER_VITISAI)
+    set(TARGET whisper.vitisai)
+
+    add_library(${TARGET} OBJECT
+        vitisai/whisper-vitisai-encoder.h
+        vitisai/whisper-vitisai-encoder.cpp
+        )
+
+    target_include_directories(${TARGET} PUBLIC
+        .
+        )
+
+    set_property(TARGET ${TARGET} PROPERTY POSITION_INDEPENDENT_CODE ON)
+    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_VITISAI)
+
+    # Add C++17 standard for MSVC
+    if (MSVC)
+        target_compile_options(${TARGET} PRIVATE /std:c++17)
+    endif()
+
+    target_link_libraries(${TARGET} PRIVATE ggml flexmlrt::flexmlrt)
+    set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
+endif()
+
 # whisper

 add_library(whisper
@@ -137,6 +165,10 @@ if (WHISPER_OPENVINO)
     target_link_libraries(whisper PRIVATE whisper.openvino)
 endif()

+if (WHISPER_VITISAI)
+    target_link_libraries(whisper PRIVATE whisper.vitisai)
+endif()
+
 if (WHISPER_MKL)
     target_link_libraries(whisper PRIVATE MKL::MKL)
 endif()
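
The object library above is only compiled and linked when `WHISPER_VITISAI` is on, and `-DWHISPER_USE_VITISAI` is the define the rest of the code base can key off. The whisper.cpp-side integration is not part of the excerpt shown here; the following is a minimal sketch of how that define could gate an encoder call (the wrapper `encode_with_vitisai` is hypothetical, not a function from this commit):

```cpp
// Hypothetical sketch: gating the Vitis AI encoder behind the
// WHISPER_USE_VITISAI define set in src/CMakeLists.txt.
#ifdef WHISPER_USE_VITISAI
#include "vitisai/whisper-vitisai-encoder.h"
#endif

struct ggml_tensor;
struct whisper_vitisai_context; // full API lives in the header above

static bool encode_with_vitisai(struct whisper_vitisai_context * vctx,
                                struct ggml_tensor * mel,
                                struct ggml_tensor * out) {
#ifdef WHISPER_USE_VITISAI
    // whisper_vitisai_encode() returns 1 on success, 0 on failure
    return vctx && whisper_vitisai_encode(vctx, mel, out) == 1;
#else
    (void) vctx; (void) mel; (void) out;
    return false; // non-NPU build: caller falls back to the ggml encoder
#endif
}
```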
src/vitisai/whisper-vitisai-encoder.cpp

Lines changed: 204 additions & 0 deletions

@@ -0,0 +1,204 @@
// Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved.
#include "vitisai/whisper-vitisai-encoder.h"
#include "FlexMLClient.h"
#include "ggml.h"
#include "ggml-backend.h"

#include <cstdio>
#include <cstdlib>
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#endif
#include <cstring>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

struct whisper_vitisai_context {
    std::string model_path;
    std::shared_ptr<flexmlrt::client::Model> runner;
    uint8_t * fbs_buffer      = nullptr; // mapped .rai file contents, if any
    size_t    fbs_buffer_size = 0;
};

// Map a rai file into memory: mmap on Linux, MapViewOfFile on Windows
bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size) {
#ifdef _WIN32
    // Open the file
    HANDLE hFile = CreateFileA(path, GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (hFile == INVALID_HANDLE_VALUE) {
        std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Get the file size
    LARGE_INTEGER fileSize;
    if (!GetFileSizeEx(hFile, &fileSize)) {
        CloseHandle(hFile);
        std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Create a file mapping object (split the 64-bit size into high/low DWORDs)
    HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY,
                                         (DWORD)(fileSize.QuadPart >> 32), (DWORD)(fileSize.QuadPart & 0xffffffff), NULL);
    if (hMapping == NULL) {
        CloseHandle(hFile);
        std::fprintf(stderr, "%s: %d: Failed to create file mapping for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Map the file
    *buffer = (uint8_t *) MapViewOfFile(hMapping, FILE_MAP_READ, 0, 0, fileSize.QuadPart);
    if (*buffer == NULL) {
        CloseHandle(hMapping);
        CloseHandle(hFile);
        std::fprintf(stderr, "%s: %d: Failed to map rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // The view keeps the mapping alive, so both handles can be closed here
    CloseHandle(hMapping);
    CloseHandle(hFile);

    *size = fileSize.QuadPart;
    return true;
#else
    // Open the file
    FILE * fd = fopen(path, "rb");
    if (!fd) {
        std::fprintf(stderr, "%s: %d: Failed to open rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Get the file size
    struct stat st;
    if (fstat(fileno(fd), &st) == -1) {
        fclose(fd);
        std::fprintf(stderr, "%s: %d: Failed to get file size for rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // Mmap the file
    *buffer = (uint8_t *) mmap(nullptr, st.st_size, PROT_READ, MAP_SHARED, fileno(fd), 0);
    if (*buffer == MAP_FAILED) {
        fclose(fd);
        std::fprintf(stderr, "%s: %d: Failed to mmap rai file '%s'\n", __func__, __LINE__, path);
        return false;
    }

    // The mapping stays valid after the underlying file is closed
    fclose(fd);

    *size = st.st_size;
    return true;
#endif // _WIN32
}

void unmap_rai_file(uint8_t * buffer, size_t size) {
#ifdef _WIN32
    (void) size;
    UnmapViewOfFile(buffer);
#else
    munmap(buffer, size);
#endif // _WIN32
}
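
A short usage sketch for the two helpers above; the `.rai` path is a placeholder, not a file name from this commit:

```cpp
// Usage sketch for map_rai_file()/unmap_rai_file(); the path is a placeholder.
uint8_t * buf  = nullptr;
size_t    size = 0;
if (map_rai_file("models/whisper-base.rai", &buf, &size)) {
    // buf/size now describe the read-only mapped file contents,
    // e.g. for handing to the runtime as fbs_buffer/fbs_buffer_size
    unmap_rai_file(buf, size);
}
```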
struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model) {
    if (!path_model) {
        std::fprintf(stderr, "%s: path_model is null\n", __func__);
        return nullptr;
    }

    auto * ctx = new whisper_vitisai_context;
    ctx->model_path = path_model;

    // Override the model path with the environment variable if it is set
    if (const char * env_model_path = std::getenv("OVERRIDE_VITISAI_MODEL_PATH")) {
        if (env_model_path[0] != '\0') {
            ctx->model_path = env_model_path;
        }
    }

    // Step 1: Set up the model
    flexmlrt::client::Options options;
    options.modelPath   = ctx->model_path;
    options.deviceName  = "stx";
    options.debug       = false;
    options.executeMode = 2;
    options.extOptions["ai_analyzer_profiling"] = true; // Enable AIA profiling
    options.extOptions["enable_preemption"]     = true;

    // If model_path is a rai file, map it (works on both Linux and Windows)
    // and pass the buffer to the runtime via fbs_buffer/fbs_buffer_size
    if (ctx->model_path.find(".rai") != std::string::npos) {
        if (map_rai_file(ctx->model_path.c_str(), &ctx->fbs_buffer, &ctx->fbs_buffer_size)) {
            options.extOptions["fbs_buffer"]      = ctx->fbs_buffer;
            options.extOptions["fbs_buffer_size"] = ctx->fbs_buffer_size;
            options.subgraphName                  = "vaiml_par_0";
            options.extOptions["cache_dir"]       = std::string(".");
        } else {
            std::fprintf(stderr, "%s: Failed to mmap rai file '%s'\n", __func__, ctx->model_path.c_str());
            delete ctx;
            return nullptr;
        }
    }

    try {
        ctx->runner = std::make_shared<flexmlrt::client::Model>(options);

        if (!ctx->runner->good()) {
            throw std::runtime_error("Runner creation ran into an error");
        }
    } catch (const std::exception & e) {
        std::fprintf(stderr, "%s: Exception during Vitis AI runner creation: %s\n", __func__, e.what());
        whisper_vitisai_free(ctx); // also unmaps the rai buffer if it was mapped
        return nullptr;
    }
    return ctx;
}

void whisper_vitisai_free(struct whisper_vitisai_context * ctx) {
    if (!ctx) {
        return;
    }

    std::fprintf(stderr, "%s: releasing Vitis AI encoder context for model '%s'\n", __func__, ctx->model_path.c_str());
    if (ctx->fbs_buffer) {
        unmap_rai_file(ctx->fbs_buffer, ctx->fbs_buffer_size);
    }
    delete ctx;
}
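
Taken together, init and free bracket the encoder's lifetime. A minimal caller sketch (the `.rai` path is a placeholder):

```cpp
// Lifecycle sketch: one context per model, freed when inference is done.
struct whisper_vitisai_context * vctx = whisper_vitisai_init("models/whisper-base.rai");
if (vctx) {
    // ... call whisper_vitisai_encode() once per encoder pass ...
    whisper_vitisai_free(vctx);
}
```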
int whisper_vitisai_encode(struct whisper_vitisai_context * ctx, struct ggml_tensor * mel, struct ggml_tensor * out) {
    if (!ctx || !mel || !out) {
        std::fprintf(stderr, "%s: ctx/mel/out must not be null\n", __func__);
        return 0;
    }

    if (ggml_n_dims(mel) != 2) {
        std::fprintf(stderr, "%s: mel tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(mel));
        return 0;
    }

    if (ggml_n_dims(out) != 2) {
        std::fprintf(stderr, "%s: out tensor expected to have 2 dims, got %d\n", __func__, ggml_n_dims(out));
        return 0;
    }

    // Set up input and output tensors for the Vitis AI model
    std::vector<flexmlrt::client::ErtTensorType> input_tensors, output_tensors;
    auto model = ctx->runner;

    // Get tensors as CPU tensors (hwTensor = false)
    input_tensors  = model->getIOTensors("input", false);
    output_tensors = model->getIOTensors("output", false);

    // TODO: add assert checks for tensor numbers and shapes

    // Point the runtime tensors directly at the ggml tensor data (zero-copy)
    input_tensors[0].data  = mel->data;
    output_tensors[0].data = out->data;

    try {
        model->forward(input_tensors, output_tensors);
        std::fprintf(stdout, "%s: Vitis AI model inference completed.\n", __func__);
    } catch (const std::exception & e) {
        std::fprintf(stderr, "%s: Exception during model inference: %s\n", __func__, e.what());
        return 0;
    }

    return 1;
}
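
A sketch of how a caller could wire ggml tensors into `whisper_vitisai_encode()`. The shapes are assumptions for a base-size Whisper model (80 mel bins by 3000 frames in, 512 by 1500 encoder states out), not values taken from this commit:

```cpp
#include "ggml.h"
#include "vitisai/whisper-vitisai-encoder.h"

// Sketch: driving the encoder with ggml-backed buffers. Shapes below are
// assumptions for a base-size model, not checked against the .rai graph.
int run_encoder_once(struct whisper_vitisai_context * vctx) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 32u*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * gctx = ggml_init(params);

    struct ggml_tensor * mel = ggml_new_tensor_2d(gctx, GGML_TYPE_F32, 3000, 80);  // [frames, mel bins]
    struct ggml_tensor * out = ggml_new_tensor_2d(gctx, GGML_TYPE_F32, 512, 1500); // [n_state, n_ctx]

    // ... fill mel->data with the log-mel spectrogram ...

    const int ok = whisper_vitisai_encode(vctx, mel, out); // 1 on success
    ggml_free(gctx);
    return ok;
}
```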
src/vitisai/whisper-vitisai-encoder.h

Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
// Copyright(C) 2025 Advanced Micro Devices, Inc. All rights reserved.

#pragma once

#include <stddef.h>
#include <stdbool.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

struct whisper_vitisai_context;

struct whisper_vitisai_context * whisper_vitisai_init(const char * path_model);
void whisper_vitisai_free(struct whisper_vitisai_context * ctx);

// Map a rai file: mmap on Linux, MapViewOfFile on Windows
bool map_rai_file(const char * path, uint8_t ** buffer, size_t * size);
// Unmap a rai file: munmap on Linux, UnmapViewOfFile on Windows
void unmap_rai_file(uint8_t * buffer, size_t size);

struct ggml_tensor;

// Run the Whisper encoder on the NPU; returns 1 on success, 0 on failure
int whisper_vitisai_encode(
    struct whisper_vitisai_context * ctx,
    struct ggml_tensor * mel,
    struct ggml_tensor * out);

#ifdef __cplusplus
}
#endif
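
One practical consequence of the `OVERRIDE_VITISAI_MODEL_PATH` handling in `whisper_vitisai_init()` is that the NPU model can be swapped without changing the path passed by the caller. A sketch (paths are placeholders):

```cpp
#include <cstdlib>
#include "vitisai/whisper-vitisai-encoder.h"

// Sketch: OVERRIDE_VITISAI_MODEL_PATH redirects whisper_vitisai_init()
// regardless of the path argument. Paths below are placeholders.
int main() {
#ifdef _WIN32
    _putenv_s("OVERRIDE_VITISAI_MODEL_PATH", "D:\\models\\whisper-base.rai");
#else
    setenv("OVERRIDE_VITISAI_MODEL_PATH", "/opt/models/whisper-base.rai", 1);
#endif
    struct whisper_vitisai_context * vctx = whisper_vitisai_init("models/whisper-base.rai");
    if (vctx) {
        whisper_vitisai_free(vctx);
    }
    return 0;
}
```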
