Skip to content

Commit a2d0e1c

Browse files
committed
Revert "Include audio preprocessing for raw audio tensor (#13855)"
This reverts commit 624463e.
1 parent aa08df5 commit a2d0e1c

File tree

1 file changed

+24
-174
lines changed

1 file changed

+24
-174
lines changed

examples/models/voxtral/multimodal.cpp

Lines changed: 24 additions & 174 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,6 @@
1212

1313
#include <gflags/gflags.h>
1414

15-
#include <executorch/extension/module/module.h>
16-
#include <executorch/extension/tensor/tensor_ptr_maker.h>
17-
#include <executorch/runtime/core/evalue.h>
18-
1915
#include <executorch/extension/llm/runner/audio.h>
2016
#include <executorch/extension/llm/runner/image.h>
2117
#include <executorch/extension/llm/runner/llm_runner_helper.h>
@@ -40,11 +36,6 @@ DEFINE_string(prompt, "What is happening in this audio?", "Text prompt.");
4036

4137
DEFINE_string(audio_path, "", "Path to input audio file.");
4238

43-
DEFINE_string(
44-
processor_path,
45-
"",
46-
"Path to processor .pte file for raw audio processing.");
47-
4839
DEFINE_double(
4940
temperature,
5041
0.8f,
@@ -59,48 +50,16 @@ DEFINE_bool(warmup, false, "Whether to run a warmup run.");
5950

6051
namespace {
6152

62-
using ::executorch::extension::from_blob;
63-
using ::executorch::extension::Module;
6453
using ::executorch::extension::llm::Image;
6554
using ::executorch::extension::llm::make_image_input;
6655
using ::executorch::extension::llm::make_text_input;
6756
using ::executorch::extension::llm::MultimodalInput;
68-
using ::executorch::runtime::EValue;
6957

7058
bool ends_with(const std::string& str, const std::string& suffix) {
7159
return str.size() >= suffix.size() &&
7260
str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
7361
}
7462

75-
/**
76-
* @brief Loads float data from a binary file
77-
*
78-
* @param audio_path Path to the binary audio file (.bin)
79-
* @return Vector of float data loaded from the file
80-
* @throws std::runtime_error if file loading fails
81-
*/
82-
std::vector<float> loadBinaryFloatData(const std::string& audio_path) {
83-
std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
84-
if (!f.is_open()) {
85-
ET_LOG(Error, "Failed to open audio file: %s", audio_path.c_str());
86-
throw std::runtime_error("Failed to open audio file");
87-
}
88-
89-
std::size_t n_floats =
90-
f.tellg() / sizeof(float); // Number of floats in the audio file
91-
f.seekg(0, std::ios::beg);
92-
93-
std::vector<float> audio_data(n_floats);
94-
f.read(
95-
reinterpret_cast<char*>(audio_data.data()),
96-
audio_data.size() * sizeof(float));
97-
f.close();
98-
99-
ET_LOG(
100-
Info, "Loaded .bin file: %s, %zu floats", audio_path.c_str(), n_floats);
101-
return audio_data;
102-
}
103-
10463
/**
10564
* @brief Loads preprocessed audio data from a binary file
10665
*
@@ -114,19 +73,22 @@ std::vector<float> loadBinaryFloatData(const std::string& audio_path) {
11473
* @return MultimodalInput containing the loaded audio data
11574
*/
11675
MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
117-
std::vector<float> audio_data = loadBinaryFloatData(audio_path);
118-
76+
std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
11977
int32_t n_bins = 128;
12078
int32_t n_frames = 3000;
121-
122-
std::size_t n_floats = audio_data.size();
79+
std::size_t n_floats =
80+
f.tellg() / sizeof(float); // Number of floats in the audio file.
81+
f.seekg(0, std::ios::beg);
12382
int32_t batch_size = ceil(
12483
n_floats /
12584
(n_bins * n_frames)); // Batch in increments of n_frames, rounding up.
85+
std::vector<float> audio_data(batch_size * n_bins * n_frames);
86+
f.read(
87+
reinterpret_cast<char*>(audio_data.data()),
88+
audio_data.size() * sizeof(float));
12689

12790
ET_LOG(Info, "audio_data len = %d", audio_data.size());
12891

129-
// Create Audio multimodal input
13092
auto audio = std::make_unique<::executorch::extension::llm::Audio>();
13193
audio->batch_size = batch_size;
13294
audio->n_bins = n_bins;
@@ -137,141 +99,30 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
13799
return ::executorch::extension::llm::make_audio_input(std::move(*audio));
138100
}
139101

140-
/**
141-
* @brief Loads a .bin file into a tensor and processes it using a .pte
142-
* processor
143-
*
144-
* This function loads raw audio data from a .bin file (similar to
145-
* loadPreprocessedAudio), creates a tensor from it, and then passes it through
146-
* a processor module loaded from a .pte file to generate processed audio
147-
* features.
148-
*
149-
* @param audio_path Path to the .bin audio file
150-
* @param processor_path Path to the .pte processor file
151-
* @return MultimodalInput containing the processed audio data
152-
* @throws std::runtime_error if file loading or processing fails
153-
*/
154-
MultimodalInput processRawAudioFile(
155-
const std::string& audio_path,
156-
const std::string& processor_path) {
157-
if (processor_path.empty()) {
158-
ET_LOG(Error, "Processor path is required for raw audio processing");
159-
throw std::runtime_error(
160-
"Processor path is required for raw audio processing");
161-
}
162-
163-
// Load the audio processor .pte.
164-
std::unique_ptr<Module> processor_module;
165-
try {
166-
processor_module =
167-
std::make_unique<Module>(processor_path, Module::LoadMode::File);
168-
auto load_error = processor_module->load();
169-
if (load_error != ::executorch::runtime::Error::Ok) {
170-
ET_LOG(
171-
Error,
172-
"Failed to load processor module from: %s",
173-
processor_path.c_str());
174-
throw std::runtime_error("Failed to load processor module");
175-
}
176-
} catch (const std::exception& e) {
177-
ET_LOG(Error, "Exception while loading processor module: %s", e.what());
178-
throw std::runtime_error("Exception while loading processor module");
179-
}
180-
181-
// Load the audio data from file.
182-
std::vector<float> audio_data = loadBinaryFloatData(audio_path);
183-
184-
// Execute the processor
185-
std::vector<executorch::aten::SizesType> tensor_shape = {
186-
static_cast<executorch::aten::SizesType>(audio_data.size())};
187-
auto input_tensor = from_blob(
188-
audio_data.data(), tensor_shape, ::executorch::aten::ScalarType::Float);
189-
190-
ET_LOG(Info, "Processing audio through processor module...");
191-
auto result = processor_module->execute("forward", input_tensor);
192-
if (!result.ok()) {
193-
ET_LOG(Error, "Failed to execute processor's forward method");
194-
throw std::runtime_error("Failed to execute processor forward method");
195-
}
196-
197-
auto outputs = result.get();
198-
if (outputs.empty()) {
199-
ET_LOG(Error, "Processor returned no outputs");
200-
throw std::runtime_error("Processor returned no outputs");
201-
}
202-
203-
// Extract processed audio features
204-
const auto& processed_tensor = outputs[0].toTensor();
205-
const float* processed_data = processed_tensor.const_data_ptr<float>();
206-
const auto& sizes = processed_tensor.sizes();
207-
208-
ET_LOG(
209-
Info,
210-
"Processed audio tensor shape: [%d, %d, %d]",
211-
static_cast<int>(sizes[0]),
212-
static_cast<int>(sizes[1]),
213-
static_cast<int>(sizes[2]));
214-
215-
// Create Audio multimodal input from processed features
216-
auto processed_audio =
217-
std::make_unique<::executorch::extension::llm::Audio>();
218-
processed_audio->batch_size =
219-
static_cast<int32_t>(sizes[0]); // Note: batching for s > 30 doesn't work
220-
// yet, so this will just be = 1.
221-
processed_audio->n_bins = static_cast<int32_t>(sizes[1]);
222-
processed_audio->n_frames =
223-
static_cast<int32_t>(sizes[2]); // And this will just be = 3000.
224-
225-
size_t total_elements = processed_audio->batch_size *
226-
processed_audio->n_bins * processed_audio->n_frames;
227-
processed_audio->data.resize(total_elements * sizeof(float));
228-
std::memcpy(
229-
processed_audio->data.data(),
230-
processed_data,
231-
total_elements * sizeof(float));
232-
233-
ET_LOG(
234-
Info,
235-
"Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d",
236-
processed_audio->batch_size,
237-
processed_audio->n_bins,
238-
processed_audio->n_frames);
239-
240-
return ::executorch::extension::llm::make_audio_input(
241-
std::move(*processed_audio));
242-
}
243-
244102
/**
245103
* @brief Processes audio files for multimodal input
246104
*
247-
* Dispatches audio file processing based on file extension and processor
248-
* availability:
249-
* - .bin files with processor: Loads raw audio from .bin and processes through
250-
* processor
251-
* - .bin files without processor: Loads preprocessed mel spectrogram features
252-
* directly
105+
* Dispatches audio file processing based on file extension:
106+
* - .bin files: Loads preprocessed mel spectrogram features directly
107+
* - .wav/.mp3 files: Currently unsupported, throws runtime_error
108+
*
109+
* This function provides an interface for different audio input formats
110+
* and can be extended to support raw audio processing in the future.
253111
*
254-
* @param audio_path Path to the audio file (.bin)
255-
* @param processor_path Path to the processor .pte file (optional)
112+
* @param audio_path Path to the audio file
256113
* @return MultimodalInput containing the processed audio data
257114
* @throws std::runtime_error if file format is unsupported or processing fails
258115
*/
259-
MultimodalInput processAudioFile(
260-
const std::string& audio_path,
261-
const std::string& processor_path = "") {
116+
MultimodalInput processAudioFile(const std::string& audio_path) {
262117
if (ends_with(audio_path, ".bin")) {
263-
if (!processor_path.empty()) {
264-
// Process raw audio from .bin file through the processor
265-
return processRawAudioFile(audio_path, processor_path);
266-
} else {
267-
// Load preprocessed audio stored as a binary file (existing behavior)
268-
return loadPreprocessedAudio(audio_path);
269-
}
118+
// Current behavior - load preprocessed audio stored as a binary file.
119+
return loadPreprocessedAudio(audio_path);
120+
} else if (ends_with(audio_path, ".wav") || ends_with(audio_path, ".mp3")) {
121+
// New: Process raw audio files - unsupported for now
122+
ET_LOG(Error, "Raw audio file processing (.wav/.mp3) is not yet supported");
123+
throw std::runtime_error("Raw audio file processing not supported");
270124
} else {
271-
ET_LOG(
272-
Error,
273-
"Unsupported audio file format: %s (only .bin files are supported)",
274-
audio_path.c_str());
125+
ET_LOG(Error, "Unsupported audio file format: %s", audio_path.c_str());
275126
throw std::runtime_error("Unsupported audio file format");
276127
}
277128
}
@@ -286,7 +137,6 @@ int32_t main(int32_t argc, char** argv) {
286137
const char* tokenizer_path = FLAGS_tokenizer_path.c_str();
287138
const char* prompt = FLAGS_prompt.c_str();
288139
const char* audio_path = FLAGS_audio_path.c_str();
289-
const char* processor_path = FLAGS_processor_path.c_str();
290140
float temperature = FLAGS_temperature;
291141
int32_t cpu_threads = FLAGS_cpu_threads;
292142
bool warmup = FLAGS_warmup;
@@ -334,7 +184,7 @@ int32_t main(int32_t argc, char** argv) {
334184
inputs.emplace_back(make_text_input("<s>[INST][BEGIN_AUDIO]"));
335185

336186
// 2. Add audio input
337-
inputs.emplace_back(processAudioFile(audio_path, processor_path));
187+
inputs.emplace_back(processAudioFile(audio_path));
338188

339189
// 3. Add text input (the actual user-submitted prompt)
340190
inputs.emplace_back(make_text_input(std::string(prompt) + "[/INST]"));

0 commit comments

Comments
 (0)