Skip to content

Commit 624463e

Browse files
authored
Include audio preprocessing for raw audio tensor (#13855)
## Summary

Runs audio preprocessing (mel spectrogram conversion) on a raw audio tensor `.bin` file, using a `.pte` exported from https://github.com/pytorch/executorch/blob/main/extension/audio/mel_spectrogram.py

Current limitations:
- The spectrogram processing module does not batch its output, so only audio shorter than 30 seconds is supported.

Example output:
```
The speaker in this audio seems to be talking about their concerns about a device called the model or maybe they're just talking about the model in general. They mention that the model was trained with the speaker for inference, which suggests that the model was trained based on the speaker's data or instructions. They also mention that the volume is quite small, which could imply that the speaker is trying to control the volume of the model's output, likely because they are concerned about how loud the model's responses might
PyTorchObserver {"prompt_tokens":388,"generated_tokens":99,"model_load_start_ms":0,"model_load_end_ms":0,"inference_start_ms":1756351346381,"inference_end_ms":1756351362602,"prompt_eval_end_ms":1756351351435,"first_token_ms":1756351351435,"aggregate_sampling_time_ms":99,"SCALING_FACTOR_UNITS_PER_SECOND":1000}
I 00:00:24.036773 executorch:stats.h:104] Prompt Tokens: 388 Generated Tokens: 99
I 00:00:24.036800 executorch:stats.h:110] Model Load Time: 0.000000 (seconds)
I 00:00:24.036805 executorch:stats.h:117] Total inference time: 16.221000 (seconds) Rate: 6.103200 (tokens/second)
I 00:00:24.036815 executorch:stats.h:127] Prompt evaluation: 5.054000 (seconds) Rate: 76.770875 (tokens/second)
I 00:00:24.036819 executorch:stats.h:136] Generated 99 tokens: 11.167000 (seconds) Rate: 8.865407 (tokens/second)
I 00:00:24.036822 executorch:stats.h:147] Time to first generated token: 5.054000 (seconds)
I 00:00:24.036828 executorch:stats.h:153] Sampling time over 487 tokens: 0.099000 (seconds)
```
1 parent ae07cb6 commit 624463e

File tree

1 file changed

+174
-24
lines changed

1 file changed

+174
-24
lines changed

examples/models/voxtral/multimodal.cpp

Lines changed: 174 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212

1313
#include <gflags/gflags.h>
1414

15+
#include <executorch/extension/module/module.h>
16+
#include <executorch/extension/tensor/tensor_ptr_maker.h>
17+
#include <executorch/runtime/core/evalue.h>
18+
1519
#include <executorch/extension/llm/runner/audio.h>
1620
#include <executorch/extension/llm/runner/image.h>
1721
#include <executorch/extension/llm/runner/llm_runner_helper.h>
@@ -36,6 +40,11 @@ DEFINE_string(prompt, "What is happening in this audio?", "Text prompt.");
3640

3741
DEFINE_string(audio_path, "", "Path to input audio file.");
3842

43+
DEFINE_string(
44+
processor_path,
45+
"",
46+
"Path to processor .pte file for raw audio processing.");
47+
3948
DEFINE_double(
4049
temperature,
4150
0.8f,
@@ -50,16 +59,48 @@ DEFINE_bool(warmup, false, "Whether to run a warmup run.");
5059

5160
namespace {
5261

62+
using ::executorch::extension::from_blob;
63+
using ::executorch::extension::Module;
5364
using ::executorch::extension::llm::Image;
5465
using ::executorch::extension::llm::make_image_input;
5566
using ::executorch::extension::llm::make_text_input;
5667
using ::executorch::extension::llm::MultimodalInput;
68+
using ::executorch::runtime::EValue;
5769

5870
// Returns true when `str` finishes with `suffix` (an empty suffix always
// matches).
bool ends_with(const std::string& str, const std::string& suffix) {
  // A suffix longer than the string itself can never match.
  if (suffix.size() > str.size()) {
    return false;
  }
  // Compare only the tail of `str`, starting where the suffix would begin.
  const std::size_t tail_start = str.size() - suffix.size();
  return str.compare(tail_start, std::string::npos, suffix) == 0;
}
6274

75+
/**
76+
* @brief Loads float data from a binary file
77+
*
78+
* @param audio_path Path to the binary audio file (.bin)
79+
* @return Vector of float data loaded from the file
80+
* @throws std::runtime_error if file loading fails
81+
*/
82+
std::vector<float> loadBinaryFloatData(const std::string& audio_path) {
83+
std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
84+
if (!f.is_open()) {
85+
ET_LOG(Error, "Failed to open audio file: %s", audio_path.c_str());
86+
throw std::runtime_error("Failed to open audio file");
87+
}
88+
89+
std::size_t n_floats =
90+
f.tellg() / sizeof(float); // Number of floats in the audio file
91+
f.seekg(0, std::ios::beg);
92+
93+
std::vector<float> audio_data(n_floats);
94+
f.read(
95+
reinterpret_cast<char*>(audio_data.data()),
96+
audio_data.size() * sizeof(float));
97+
f.close();
98+
99+
ET_LOG(
100+
Info, "Loaded .bin file: %s, %zu floats", audio_path.c_str(), n_floats);
101+
return audio_data;
102+
}
103+
63104
/**
64105
* @brief Loads preprocessed audio data from a binary file
65106
*
@@ -73,22 +114,19 @@ bool ends_with(const std::string& str, const std::string& suffix) {
73114
* @return MultimodalInput containing the loaded audio data
74115
*/
75116
MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
76-
std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
117+
std::vector<float> audio_data = loadBinaryFloatData(audio_path);
118+
77119
int32_t n_bins = 128;
78120
int32_t n_frames = 3000;
79-
std::size_t n_floats =
80-
f.tellg() / sizeof(float); // Number of floats in the audio file.
81-
f.seekg(0, std::ios::beg);
121+
122+
std::size_t n_floats = audio_data.size();
82123
int32_t batch_size = ceil(
83124
n_floats /
84125
(n_bins * n_frames)); // Batch in increments of n_frames, rounding up.
85-
std::vector<float> audio_data(batch_size * n_bins * n_frames);
86-
f.read(
87-
reinterpret_cast<char*>(audio_data.data()),
88-
audio_data.size() * sizeof(float));
89126

90127
ET_LOG(Info, "audio_data len = %d", audio_data.size());
91128

129+
// Create Audio multimodal input
92130
auto audio = std::make_unique<::executorch::extension::llm::Audio>();
93131
audio->batch_size = batch_size;
94132
audio->n_bins = n_bins;
@@ -100,29 +138,140 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
100138
}
101139

102140
/**
103-
* @brief Processes audio files for multimodal input
141+
* @brief Loads a .bin file into a tensor and processes it using a .pte
142+
* processor
104143
*
105-
* Dispatches audio file processing based on file extension:
106-
* - .bin files: Loads preprocessed mel spectrogram features directly
107-
* - .wav/.mp3 files: Currently unsupported, throws runtime_error
144+
* This function loads raw audio data from a .bin file (similar to
145+
* loadPreprocessedAudio), creates a tensor from it, and then passes it through
146+
* a processor module loaded from a .pte file to generate processed audio
147+
* features.
148+
*
149+
* @param audio_path Path to the .bin audio file
150+
* @param processor_path Path to the .pte processor file
151+
* @return MultimodalInput containing the processed audio data
152+
* @throws std::runtime_error if file loading or processing fails
153+
*/
154+
MultimodalInput processRawAudioFile(
155+
const std::string& audio_path,
156+
const std::string& processor_path) {
157+
if (processor_path.empty()) {
158+
ET_LOG(Error, "Processor path is required for raw audio processing");
159+
throw std::runtime_error(
160+
"Processor path is required for raw audio processing");
161+
}
162+
163+
// Load the audio processor .pte.
164+
std::unique_ptr<Module> processor_module;
165+
try {
166+
processor_module =
167+
std::make_unique<Module>(processor_path, Module::LoadMode::File);
168+
auto load_error = processor_module->load();
169+
if (load_error != ::executorch::runtime::Error::Ok) {
170+
ET_LOG(
171+
Error,
172+
"Failed to load processor module from: %s",
173+
processor_path.c_str());
174+
throw std::runtime_error("Failed to load processor module");
175+
}
176+
} catch (const std::exception& e) {
177+
ET_LOG(Error, "Exception while loading processor module: %s", e.what());
178+
throw std::runtime_error("Exception while loading processor module");
179+
}
180+
181+
// Load the audio data from file.
182+
std::vector<float> audio_data = loadBinaryFloatData(audio_path);
183+
184+
// Execute the processor
185+
std::vector<executorch::aten::SizesType> tensor_shape = {
186+
static_cast<executorch::aten::SizesType>(audio_data.size())};
187+
auto input_tensor = from_blob(
188+
audio_data.data(), tensor_shape, ::executorch::aten::ScalarType::Float);
189+
190+
ET_LOG(Info, "Processing audio through processor module...");
191+
auto result = processor_module->execute("forward", input_tensor);
192+
if (!result.ok()) {
193+
ET_LOG(Error, "Failed to execute processor's forward method");
194+
throw std::runtime_error("Failed to execute processor forward method");
195+
}
196+
197+
auto outputs = result.get();
198+
if (outputs.empty()) {
199+
ET_LOG(Error, "Processor returned no outputs");
200+
throw std::runtime_error("Processor returned no outputs");
201+
}
202+
203+
// Extract processed audio features
204+
const auto& processed_tensor = outputs[0].toTensor();
205+
const float* processed_data = processed_tensor.const_data_ptr<float>();
206+
const auto& sizes = processed_tensor.sizes();
207+
208+
ET_LOG(
209+
Info,
210+
"Processed audio tensor shape: [%d, %d, %d]",
211+
static_cast<int>(sizes[0]),
212+
static_cast<int>(sizes[1]),
213+
static_cast<int>(sizes[2]));
214+
215+
// Create Audio multimodal input from processed features
216+
auto processed_audio =
217+
std::make_unique<::executorch::extension::llm::Audio>();
218+
processed_audio->batch_size =
219+
static_cast<int32_t>(sizes[0]); // Note: batching for s > 30 doesn't work
220+
// yet, so this will just be = 1.
221+
processed_audio->n_bins = static_cast<int32_t>(sizes[1]);
222+
processed_audio->n_frames =
223+
static_cast<int32_t>(sizes[2]); // And this will just be = 3000.
224+
225+
size_t total_elements = processed_audio->batch_size *
226+
processed_audio->n_bins * processed_audio->n_frames;
227+
processed_audio->data.resize(total_elements * sizeof(float));
228+
std::memcpy(
229+
processed_audio->data.data(),
230+
processed_data,
231+
total_elements * sizeof(float));
232+
233+
ET_LOG(
234+
Info,
235+
"Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d",
236+
processed_audio->batch_size,
237+
processed_audio->n_bins,
238+
processed_audio->n_frames);
239+
240+
return ::executorch::extension::llm::make_audio_input(
241+
std::move(*processed_audio));
242+
}
243+
244+
/**
245+
* @brief Processes audio files for multimodal input
108246
*
109-
* This function provides a interface for different audio input formats
110-
* and can be extended to support raw audio processing in the future.
247+
* Dispatches audio file processing based on file extension and processor
248+
* availability:
249+
* - .bin files with processor: Loads raw audio from .bin and processes through
250+
* processor
251+
* - .bin files without processor: Loads preprocessed mel spectrogram features
252+
* directly
111253
*
112-
* @param audio_path Path to the audio file
254+
* @param audio_path Path to the audio file (.bin)
255+
* @param processor_path Path to the processor .pte file (optional)
113256
* @return MultimodalInput containing the processed audio data
114257
* @throws std::runtime_error if file format is unsupported or processing fails
115258
*/
116-
MultimodalInput processAudioFile(const std::string& audio_path) {
259+
MultimodalInput processAudioFile(
260+
const std::string& audio_path,
261+
const std::string& processor_path = "") {
117262
if (ends_with(audio_path, ".bin")) {
118-
// Current behavior - load preprocessed audio stored as a binary file.
119-
return loadPreprocessedAudio(audio_path);
120-
} else if (ends_with(audio_path, ".wav") || ends_with(audio_path, ".mp3")) {
121-
// New: Process raw audio files - unsupported for now
122-
ET_LOG(Error, "Raw audio file processing (.wav/.mp3) is not yet supported");
123-
throw std::runtime_error("Raw audio file processing not supported");
263+
if (!processor_path.empty()) {
264+
// Process raw audio from .bin file through the processor
265+
return processRawAudioFile(audio_path, processor_path);
266+
} else {
267+
// Load preprocessed audio stored as a binary file (existing behavior)
268+
return loadPreprocessedAudio(audio_path);
269+
}
124270
} else {
125-
ET_LOG(Error, "Unsupported audio file format: %s", audio_path.c_str());
271+
ET_LOG(
272+
Error,
273+
"Unsupported audio file format: %s (only .bin files are supported)",
274+
audio_path.c_str());
126275
throw std::runtime_error("Unsupported audio file format");
127276
}
128277
}
@@ -137,6 +286,7 @@ int32_t main(int32_t argc, char** argv) {
137286
const char* tokenizer_path = FLAGS_tokenizer_path.c_str();
138287
const char* prompt = FLAGS_prompt.c_str();
139288
const char* audio_path = FLAGS_audio_path.c_str();
289+
const char* processor_path = FLAGS_processor_path.c_str();
140290
float temperature = FLAGS_temperature;
141291
int32_t cpu_threads = FLAGS_cpu_threads;
142292
bool warmup = FLAGS_warmup;
@@ -184,7 +334,7 @@ int32_t main(int32_t argc, char** argv) {
184334
inputs.emplace_back(make_text_input("<s>[INST][BEGIN_AUDIO]"));
185335

186336
// 2. Add audio input
187-
inputs.emplace_back(processAudioFile(audio_path));
337+
inputs.emplace_back(processAudioFile(audio_path, processor_path));
188338

189339
// 3. Add text input (the actual user-submitted prompt)
190340
inputs.emplace_back(make_text_input(std::string(prompt) + "[/INST]"));

0 commit comments

Comments
 (0)