
Commit 4d18e03

[aoti-et] Add an ASR runner and a Whisper example to showcase how to use it

**Key Changes:**
* Create a new ASR runner extension in `extension/asr/runner/` with reusable runner components (runner.h/cpp)
* Update CMake configuration files to support ASR runner builds (executorch-config.cmake, default.cmake, llm.cmake)
* Add a new Whisper model example in `examples/models/whisper/` with a CMake build, README, and main.cpp runner
* Bump the optimum-executorch commit pin for Whisper support
* Update the CUDA CI workflow for testing

This change enables automatic speech recognition (ASR) in ExecuTorch with Whisper as the first supported model, following a pattern similar to the existing LLM runner infrastructure.
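At a glance, the new ASR runner is driven much like the existing LLM runner. The sketch below is illustrative only, with names taken from the `examples/models/whisper/README.md` and `main.cpp` added in this commit; the authoritative signatures live in `extension/asr/runner/runner.h`.

```cpp
// Illustrative sketch; names follow the Whisper example in this commit and
// may drift as the runner evolves.
#include <executorch/extension/asr/runner/runner.h>
#include <executorch/extension/tensor/tensor_ptr.h>

#include <iostream>
#include <string>

using ::executorch::extension::TensorPtr;
using ::executorch::extension::asr::AsrRunner;
using ::executorch::extension::asr::AsrTranscribeConfig;

// `features` is the mel-spectrogram tensor produced by a preprocessor .pte
// (see examples/models/whisper/main.cpp for the full pipeline).
int run_whisper(TensorPtr features) {
  // model.pte holds the encoder/text_decoder methods, model.ptd the weights.
  AsrRunner runner("model.pte", "model.ptd", "tokenizer.json");
  if (runner.load() != ::executorch::runtime::Error::Ok) {
    return 1;
  }

  AsrTranscribeConfig config;
  config.max_new_tokens = 128;            // cap generation length
  config.decoder_start_token_id = 50257;  // BOS id used by the Whisper example

  // transcribe() streams decoded text pieces through the callback.
  auto result =
      runner.transcribe(features, config, [](const std::string& piece) {
        std::cout << piece;
      });
  return result.ok() ? 0 : 1;
}
```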
1 parent cc72b35 commit 4d18e03

File tree

12 files changed, +977 -257 lines changed

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-467660923a5a25e4718e1d6697b93ff1bab4e807
+4361747abfc55e40e929396ed986efe775d745f9

.github/workflows/cuda.yml

Lines changed: 146 additions & 256 deletions
Large diffs are not rendered by default.

CMakeLists.txt

Lines changed: 5 additions & 0 deletions
@@ -926,6 +926,11 @@ if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
   list(APPEND _executorch_extensions extension_llm_runner)
 endif()

+if(EXECUTORCH_BUILD_EXTENSION_ASR_RUNNER)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/asr/runner)
+  list(APPEND _executorch_extensions extension_asr_runner)
+endif()
+
 if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple)
 endif()

Lines changed: 89 additions & 0 deletions
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

cmake_minimum_required(VERSION 3.29)
project(whisper_runner)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

# Let files say "include <executorch/path/to/header.h>"
set(_common_include_directories ${EXECUTORCH_ROOT}/..)

# Need this for gflags for some reason
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
find_package(gflags REQUIRED)

list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)

set(_link_libraries executorch gflags)
set(_srcs multimodal.cpp)

list(
  APPEND
  _link_libraries
  optimized_native_cpu_ops_lib
  quantized_ops_lib
  custom_ops
  cpublas
  eigen_blas
)

# XNNPACK
if(TARGET xnnpack_backend)
  list(APPEND _link_libraries xnnpack_backend)
endif()

# Add LLM runner and extension module
if(NOT TARGET extension_asr_runner)
  message(
    FATAL_ERROR
      "ExecuTorch must be installed with EXECUTORCH_BUILD_EXTENSION_ASR_RUNNER enabled."
  )
endif()

# Needed for cpuinfo where it uses android specific log lib
if(ANDROID)
  list(APPEND _link_libraries log)
endif()

# Add the required ExecuTorch extensions for multimodal LLM runner
list(
  APPEND
  _link_libraries
  extension_asr_runner
  extension_llm_runner # Needed for load_tokenizer()
  extension_module
  extension_data_loader
  extension_tensor
  extension_flat_tensor
)

# Link CUDA backend
if(EXECUTORCH_BUILD_CUDA)
  find_package(CUDAToolkit REQUIRED)
  list(APPEND _link_libraries aoti_cuda)
  executorch_target_link_options_shared_lib(aoti_cuda)
endif()

if(EXECUTORCH_BUILD_METAL)
  list(APPEND _link_libraries metal_backend)
  executorch_target_link_options_shared_lib(metal_backend)
endif()

# Add tokenizers
list(APPEND _link_libraries tokenizers::tokenizers)

add_executable(whisper_runner main.cpp)

target_include_directories(whisper_runner PUBLIC ${_common_include_directories})

target_link_libraries(whisper_runner PUBLIC ${_link_libraries})
target_compile_options(whisper_runner PUBLIC ${_common_compile_options})

examples/models/whisper/README.md

Lines changed: 75 additions & 0 deletions
# Whisper Runner

This directory hosts a lightweight C++ helper that drives Whisper models
exported to ExecuTorch. The `AsrRunner` owns the `Module` instance that
wraps a bundled `.pte` program and optional `.ptd` weight file, loads the
`encoder` and `text_decoder` methods, and exposes a `transcribe()` loop that
streams decoded text pieces through a callback.

The runner assumes:
- `model.pte` contains both Whisper encoder and decoder entry points named
  `encoder` and `text_decoder`.
- External parameters (for example KV cache blocks) are stored in a companion
  `model.ptd`.
- A tokenizer JSON compatible with the ExecuTorch tokenizers shim is available.

Audio preprocessing is not part of the runner itself. To transform raw audio
into the mel features expected by the encoder, reuse the pattern in
`examples/models/voxtral/multimodal.cpp`, which loads a `preprocessor.pte`
module to generate the spectrogram tensor.

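As a rough sketch of that pattern, assuming (as `examples/models/whisper/main.cpp` does) that the exported preprocessor exposes a `forward` method taking the raw waveform:

```cpp
// Hedged sketch mirroring examples/models/whisper/main.cpp; the preprocessor
// path and its "forward" method are assumptions taken from that example.
#include <executorch/extension/module/module.h>
#include <executorch/extension/tensor/tensor_ptr_maker.h>

#include <memory>
#include <vector>

using ::executorch::extension::Module;
using ::executorch::extension::TensorPtr;
using ::executorch::extension::from_blob;

// Turns raw mono float samples into the mel-spectrogram tensor fed to AsrRunner.
TensorPtr make_features(std::vector<float>& audio_data) {
  Module processor("preprocessor.pte", Module::LoadMode::Mmap);
  if (processor.load() != ::executorch::runtime::Error::Ok) {
    return nullptr;
  }

  // Wrap the samples in a 1-D float tensor without copying them.
  auto audio_tensor = from_blob(
      audio_data.data(),
      {static_cast<::executorch::aten::SizesType>(audio_data.size())},
      ::executorch::aten::ScalarType::Float);

  auto result = processor.execute("forward", audio_tensor);
  if (result.error() != ::executorch::runtime::Error::Ok ||
      result.get().empty() || !result.get()[0].isTensor()) {
    return nullptr;
  }

  // The first output is the spectrogram expected by the Whisper encoder.
  return std::make_shared<::executorch::aten::Tensor>(result.get()[0].toTensor());
}
```

The resulting tensor is what the Usage example below passes to `transcribe()`.
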
## Build

```bash
# Install ExecuTorch libraries:
cmake --preset llm -DEXECUTORCH_BUILD_CUDA=ON -DCMAKE_INSTALL_PREFIX=cmake-out -DCMAKE_BUILD_TYPE=Release . -Bcmake-out
cmake --build cmake-out -j$(nproc) --target install --config Release

# Build the runner:
cmake \
  -B cmake-out/examples/models/whisper \
  -S examples/models/whisper
cmake --build cmake-out/examples/models/whisper -j
```

The first cmake command produces a static library named `extension_asr_runner`. The second cmake command
links it into your application together with the standard ExecuTorch runtime libraries and the
tokenizer target (`tokenizers::tokenizers`).

## Usage

```cpp
#include <executorch/extension/asr/runner/runner.h>
#include <executorch/extension/tensor/tensor_ptr.h>

using executorch::extension::asr::AsrRunner;
using executorch::extension::asr::AsrTranscribeConfig;

AsrRunner runner("model.pte", "model.ptd", "tokenizer.json");
ET_CHECK_OK(runner.load());

// `features` is the mel spectrogram tensor produced by the preprocessor.
executorch::extension::TensorPtr features = load_features_somehow();

AsrTranscribeConfig config;
config.max_new_tokens = 128;           // stop after 128 generated tokens
config.temperature = 0.7f;             // optional: enable stochastic sampling
config.decoder_start_token_id = 50257; // override the BOS token id

auto tokens_result = runner.transcribe(
    features,
    config,
    [](const std::string& piece) {
      std::cout << piece;
    });

if (!tokens_result.ok()) {
  ET_LOG(Error, "Transcription failed: %d", static_cast<int>(tokens_result.error()));
}
```

`transcribe()` returns the full token history (prompt + generated tokens) and
invokes the callback every time a new token is emitted. Provide a non-empty
`decoder_input_ids` vector if you want to seed the decoder with a custom prompt,
and override `AsrTranscribeConfig::eos_token_ids` when the model exposes
custom termination ids.
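
Building on the example above, the streaming callback can just as easily accumulate the pieces instead of printing them (same `runner`, `features`, and `config`; a sketch only):

```cpp
// Sketch: collect the streamed pieces into a single transcript string.
std::string transcript;
auto tokens = runner.transcribe(
    features, config, [&transcript](const std::string& piece) {
      transcript += piece;  // invoked once per emitted token
    });
if (tokens.ok()) {
  std::cout << "\nFull transcript: " << transcript << std::endl;
}
```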

examples/models/whisper/main.cpp

Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <cmath>
10+
#include <cstring>
11+
#include <fstream>
12+
#include <iostream>
13+
#include <limits>
14+
#include <memory>
15+
#include <string>
16+
#include <vector>
17+
18+
#include <gflags/gflags.h>
19+
20+
#include <executorch/extension/asr/runner/runner.h>
21+
#include <executorch/extension/llm/runner/util.h>
22+
#include <executorch/extension/llm/runner/wav_loader.h>
23+
#include <executorch/extension/module/module.h>
24+
#include <executorch/extension/tensor/tensor_ptr_maker.h>
25+
#include <executorch/runtime/core/evalue.h>
26+
#include <executorch/runtime/platform/log.h>
27+
28+
DEFINE_string(model_path, "model.pte", "Path to Whisper model (.pte).");
29+
DEFINE_string(data_path, "", "Optional path to Whisper weights (.ptd).");
30+
DEFINE_string(
31+
tokenizer_path,
32+
".",
33+
"Path to tokenizer directory containing tokenizer.json, tokenizer_config.json, and special_tokens_map.json.");
34+
DEFINE_string(
35+
processor_path,
36+
"",
37+
"Path to preprocessor .pte for converting raw audio.");
38+
DEFINE_string(
39+
audio_path,
40+
"",
41+
"Path to input audio file. Accepts .wav or raw float .bin.");
42+
DEFINE_double(
43+
temperature,
44+
0.0,
45+
"Sampling temperature. 0.0 performs greedy decoding.");
46+
DEFINE_int32(max_new_tokens, 128, "Maximum number of tokens to generate.");
47+
48+
using ::executorch::extension::from_blob;
49+
using ::executorch::extension::Module;
50+
51+
int main(int argc, char** argv) {
52+
gflags::ParseCommandLineFlags(&argc, &argv, true);
53+
::executorch::extension::TensorPtr features;
54+
std::vector<float> audio_data;
55+
std::unique_ptr<Module> processor;
56+
57+
if (FLAGS_audio_path.empty()) {
58+
ET_LOG(Error, "audio_path flag must be provided.");
59+
return 1;
60+
}
61+
62+
audio_data =
63+
executorch::extension::llm::load_wav_audio_data(FLAGS_audio_path);
64+
ET_LOG(
65+
Info,
66+
"First 2 values of audio data: %f, %f",
67+
audio_data[0],
68+
audio_data[1]);
69+
70+
processor =
71+
std::make_unique<Module>(FLAGS_processor_path, Module::LoadMode::Mmap);
72+
auto load_error = processor->load();
73+
if (load_error != ::executorch::runtime::Error::Ok) {
74+
ET_LOG(Error, "Failed to load preprocessor module.");
75+
return 1;
76+
}
77+
78+
auto audio_tensor = from_blob(
79+
audio_data.data(),
80+
{static_cast<::executorch::aten::SizesType>(audio_data.size())},
81+
::executorch::aten::ScalarType::Float);
82+
83+
auto processed_result = processor->execute("forward", audio_tensor);
84+
if (processed_result.error() != ::executorch::runtime::Error::Ok) {
85+
ET_LOG(Error, "Audio preprocessing failed.");
86+
return 1;
87+
}
88+
auto outputs = std::move(processed_result.get());
89+
if (outputs.empty() || !outputs[0].isTensor()) {
90+
ET_LOG(Error, "Preprocessor returned unexpected outputs.");
91+
return 1;
92+
}
93+
auto tensor = outputs[0].toTensor();
94+
ET_LOG(
95+
Info,
96+
"Result scalar_type: %s, first value %f",
97+
::executorch::runtime::toString(tensor.scalar_type()),
98+
tensor.mutable_data_ptr<float>()[0]);
99+
features = std::make_shared<::executorch::aten::Tensor>(std::move(tensor));
100+
101+
executorch::extension::asr::AsrRunner runner(
102+
FLAGS_model_path, FLAGS_data_path, FLAGS_tokenizer_path);
103+
auto load_err = runner.load();
104+
if (load_err != ::executorch::runtime::Error::Ok) {
105+
ET_LOG(Error, "Failed to load Whisper model.");
106+
return 1;
107+
}
108+
109+
executorch::extension::asr::AsrTranscribeConfig config;
110+
config.max_new_tokens = FLAGS_max_new_tokens;
111+
config.temperature = static_cast<float>(FLAGS_temperature);
112+
config.decoder_start_token_id = 50257;
113+
114+
auto result =
115+
runner.transcribe(features, config, [&](const std::string& piece) {
116+
::executorch::extension::llm::safe_printf(piece.c_str());
117+
fflush(stdout);
118+
});
119+
120+
if (!result.ok()) {
121+
ET_LOG(Error, "Transcription failed.");
122+
return 1;
123+
}
124+
125+
return 0;
126+
}
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
#
7+
# ASR runner for models like Whisper
8+
#
9+
# ### Editing this file ###
10+
#
11+
# This file should be formatted with
12+
# ~~~
13+
# cmake-format -i CMakeLists.txt
14+
# ~~~
15+
# It should also be cmake-lint clean.
16+
#
17+
18+
if(NOT EXECUTORCH_ROOT)
19+
set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
20+
endif()
21+
22+
include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
23+
24+
set(runner_deps executorch_core extension_module extension_tensor
25+
tokenizers::tokenizers
26+
)
27+
28+
# Define runner library
29+
add_library(extension_asr_runner STATIC runner.cpp)
30+
target_include_directories(
31+
extension_asr_runner INTERFACE ${_common_include_directories}
32+
)
33+
target_link_libraries(extension_asr_runner PUBLIC ${runner_deps})
34+
set_target_properties(
35+
extension_asr_runner PROPERTIES POSITION_INDEPENDENT_CODE ON
36+
)
37+
38+
install(
39+
TARGETS extension_asr_runner
40+
EXPORT ExecuTorchTargets
41+
DESTINATION ${CMAKE_INSTALL_LIBDIR}
42+
INCLUDES
43+
DESTINATION ${_common_include_directories}
44+
)
45+
46+
install(
47+
DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/
48+
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/executorch/extension/asr/runner
49+
FILES_MATCHING
50+
PATTERN "*.h"
51+
)
