Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 17 additions & 4 deletions examples/models/voxtral/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ To run the model, we will use the Voxtral runner, which utilizes ExecuTorch's Mu
The Voxtral runner will do the following things:

- Audio Input:
- Option A: Pass the raw audio tensor into exported preprocessor to produce a mel spectrogram tensor.
- Option B: If starting directly with an already processed audio input tensor, format the inputs to the multimodal runner (metadata tokens, audio tokens, text tokens, etc.).
- Option A: Pass raw audio data from a `.wav` file into the exported preprocessor to produce a mel spectrogram tensor.
- Option B: If starting directly with an already processed audio input tensor (preprocessed mel spectrogram), format the inputs to the multimodal runner (metadata tokens, audio tokens, text tokens, etc.).
- Feed the formatted inputs to the multimodal runner.


Expand All @@ -66,13 +66,26 @@ cmake -DCMAKE_INSTALL_PREFIX=cmake-out -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Re

## Running the model
You can download the `tekken.json` tokenizer from [Voxtral's HuggingFace repo](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507).

### Running with raw audio (.wav file)
For raw audio files (`.wav`), you must provide a preprocessor to convert the audio into mel spectrogram format:
```
./cmake-out/examples/models/voxtral/voxtral_runner \
--model_path path/to/model.pte \
--tokenizer_path path/to/tekken.json \
--prompt "What can you tell me about this audio?" \
--audio_path path/to/audio_input.wav \
--processor_path path/to/voxtral_preprocessor.pte
```

### Running with preprocessed audio (.bin file)
If you already have a preprocessed mel spectrogram saved as a `.bin` file, you can skip the preprocessor:
```
./cmake-out/examples/models/voxtral/voxtral_runner \
--model_path path/to/model.pte \
--tokenizer_path path/to/tekken.json \
--prompt "What can you tell me about this audio?" \
--audio_path path/to/audio_input.bin \
--processor_path path/to/voxtral_preprocessor.pte # If you're passing raw audio file in audio_path
--audio_path path/to/preprocessed_audio.bin
```

Example output:
Expand Down
105 changes: 65 additions & 40 deletions examples/models/voxtral/multimodal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <executorch/extension/llm/runner/llm_runner_helper.h>
#include <executorch/extension/llm/runner/multimodal_input.h>
#include <executorch/extension/llm/runner/multimodal_runner.h>
#include <executorch/extension/llm/runner/wav_loader.h>
#include <executorch/runtime/core/error.h>
#include <executorch/runtime/platform/log.h>

Expand All @@ -34,6 +35,7 @@ DEFINE_string(
"multimodal.pte",
"Model serialized in flatbuffer format.");

DEFINE_string(data_path, "", "Path to data file.");
DEFINE_string(tokenizer_path, "tekken.json", "Tokenizer stuff.");

DEFINE_string(prompt, "What is happening in this audio?", "Text prompt.");
Expand Down Expand Up @@ -113,15 +115,15 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
}

/**
* @brief Loads a .bin file into a tensor and processes it using a .pte
* processor
* @brief Loads raw audio from a .bin or .wav file and processes it using a
* .pte processor
*
* This function loads raw audio data from a .bin file (similar to
* loadPreprocessedAudio), creates a tensor from it, and then passes it through
* a processor module loaded from a .pte file to generate processed audio
* features.
* This function loads raw audio data from either a .bin file (raw float array)
* or a .wav file (WAV format with headers), creates a tensor from it, and then
* passes it through a processor module loaded from a .pte file to generate
* processed audio features.
*
* @param audio_path Path to the .bin audio file
* @param audio_path Path to the .bin or .wav audio file
* @param processor_path Path to the .pte processor file
* @return MultimodalInput containing the processed audio data
* @throws std::runtime_error if file loading or processing fails
Expand All @@ -135,6 +137,41 @@ MultimodalInput processRawAudioFile(
"Processor path is required for raw audio processing");
}

// Load the audio data from file (.bin or .wav)
std::vector<float> audio_data;
if (ends_with(audio_path, ".wav")) {
audio_data = ::executorch::extension::llm::load_wav_audio_data(audio_path);
ET_LOG(
Info,
"Loaded WAV file: %s, %zu samples",
audio_path.c_str(),
audio_data.size());
} else if (ends_with(audio_path, ".bin")) {
std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
if (!f.is_open()) {
ET_LOG(Error, "Failed to open audio file: %s", audio_path.c_str());
throw std::runtime_error("Failed to open audio file");
}

std::size_t n_floats = f.tellg() / sizeof(float);
f.seekg(0, std::ios::beg);

audio_data.resize(n_floats);
f.read(
reinterpret_cast<char*>(audio_data.data()),
audio_data.size() * sizeof(float));
f.close();

ET_LOG(
Info, "Loaded .bin file: %s, %zu floats", audio_path.c_str(), n_floats);
} else {
ET_LOG(
Error,
"Unsupported audio file format: %s (only .bin and .wav files are supported)",
audio_path.c_str());
throw std::runtime_error("Unsupported audio file format");
}

// Load the audio processor .pte.
std::unique_ptr<Module> processor_module;
try {
Expand All @@ -153,25 +190,6 @@ MultimodalInput processRawAudioFile(
throw std::runtime_error("Exception while loading processor module");
}

// Load the audio data from file.
std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
if (!f.is_open()) {
ET_LOG(Error, "Failed to open audio file: %s", audio_path.c_str());
throw std::runtime_error("Failed to open audio file");
}

std::size_t n_floats = f.tellg() / sizeof(float);
f.seekg(0, std::ios::beg);

std::vector<float> audio_data(n_floats);
f.read(
reinterpret_cast<char*>(audio_data.data()),
audio_data.size() * sizeof(float));
f.close();

ET_LOG(
Info, "Loaded .bin file: %s, %zu floats", audio_path.c_str(), n_floats);

// Execute the processor
std::vector<executorch::aten::SizesType> tensor_shape = {
static_cast<executorch::aten::SizesType>(audio_data.size())};
Expand Down Expand Up @@ -226,33 +244,39 @@ MultimodalInput processRawAudioFile(
*
* Dispatches audio file processing based on file extension and processor
* availability:
* - .wav files: Requires processor, processes raw audio through processor
* - .bin files with processor: Loads raw audio from .bin and processes through
* processor
* - .bin files without processor: Loads preprocessed mel spectrogram features
* directly
*
* @param audio_path Path to the audio file (.bin)
* @param processor_path Path to the processor .pte file (optional)
* @param audio_path Path to the audio file (.bin or .wav)
* @param processor_path Path to the processor .pte file (optional for .bin,
* required for .wav)
* @return MultimodalInput containing the processed audio data
* @throws std::runtime_error if file format is unsupported or processing fails
*/
MultimodalInput processAudioFile(
const std::string& audio_path,
const std::string& processor_path = "") {
if (ends_with(audio_path, ".bin")) {
if (!processor_path.empty()) {
// Process raw audio from .bin file through the processor
return processRawAudioFile(audio_path, processor_path);
} else {
// Load preprocessed audio stored as a binary file (existing behavior)
return loadPreprocessedAudio(audio_path);
if (ends_with(audio_path, ".wav") || ends_with(audio_path, ".bin")) {
if (processor_path.empty()) {
if (ends_with(audio_path, ".wav")) {
ET_CHECK_MSG(
false,
"Processor path is required for .wav file processing: %s",
audio_path.c_str());
} else {
// Load preprocessed audio stored as a binary file (existing behavior)
return loadPreprocessedAudio(audio_path);
}
}
return processRawAudioFile(audio_path, processor_path);
} else {
ET_LOG(
Error,
"Unsupported audio file format: %s (only .bin files are supported)",
ET_CHECK_MSG(
false,
"Unsupported audio file format: %s (only .bin and .wav files are supported)",
audio_path.c_str());
throw std::runtime_error("Unsupported audio file format");
}
}

Expand All @@ -267,6 +291,7 @@ int32_t main(int32_t argc, char** argv) {
const char* prompt = FLAGS_prompt.c_str();
const char* audio_path = FLAGS_audio_path.c_str();
const char* processor_path = FLAGS_processor_path.c_str();
const char* data_path = FLAGS_data_path.c_str();
float temperature = FLAGS_temperature;
int32_t cpu_threads = FLAGS_cpu_threads;
bool warmup = FLAGS_warmup;
Expand Down Expand Up @@ -294,7 +319,7 @@ int32_t main(int32_t argc, char** argv) {
// Create multimodal runner
std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner =
::executorch::extension::llm::create_multimodal_runner(
model_path, std::move(tokenizer));
model_path, std::move(tokenizer), data_path);
if (runner == nullptr) {
ET_LOG(Error, "Failed to create multimodal runner");
return 1;
Expand Down
1 change: 1 addition & 0 deletions extension/llm/runner/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ def define_common_targets():
exported_headers = [
"audio.h",
"image.h",
"wav_loader.h",
"multimodal_input.h",
"multimodal_runner.h",
"multimodal_prefiller.h",
Expand Down
2 changes: 1 addition & 1 deletion extension/llm/runner/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)

set(_test_srcs
test_generation_config.cpp test_text_llm_runner.cpp test_text_prefiller.cpp
test_text_decoder_runner.cpp test_multimodal_input.cpp
test_text_decoder_runner.cpp test_multimodal_input.cpp test_wav_loader.cpp
)

# Add LSan stub for Apple platforms
Expand Down
10 changes: 10 additions & 0 deletions extension/llm/runner/test/targets.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,13 @@ def define_common_targets():
"//executorch/extension/llm/runner:multimodal_runner_lib",
],
)

runtime.cxx_test(
name = "test_wav_loader",
srcs = ["test_wav_loader.cpp"],
deps = [
"//executorch/extension/testing_util:temp_file",
"//executorch/extension/llm/runner:multimodal_runner_lib",
"//executorch/runtime/platform:platform",
],
)
Loading
Loading