12
12
13
13
#include < gflags/gflags.h>
14
14
15
+ #include < executorch/extension/module/module.h>
16
+ #include < executorch/extension/tensor/tensor_ptr_maker.h>
17
+ #include < executorch/runtime/core/evalue.h>
18
+
15
19
#include < executorch/extension/llm/runner/audio.h>
16
20
#include < executorch/extension/llm/runner/image.h>
17
21
#include < executorch/extension/llm/runner/llm_runner_helper.h>
@@ -36,6 +40,11 @@ DEFINE_string(prompt, "What is happening in this audio?", "Text prompt.");
36
40
37
41
DEFINE_string (audio_path, " " , " Path to input audio file." );
38
42
43
+ DEFINE_string (
44
+ processor_path,
45
+ " " ,
46
+ " Path to processor .pte file for raw audio processing." );
47
+
39
48
DEFINE_double (
40
49
temperature,
41
50
0 .8f ,
@@ -50,16 +59,48 @@ DEFINE_bool(warmup, false, "Whether to run a warmup run.");
50
59
51
60
namespace {
52
61
62
+ using ::executorch::extension::from_blob;
63
+ using ::executorch::extension::Module;
53
64
using ::executorch::extension::llm::Image;
54
65
using ::executorch::extension::llm::make_image_input;
55
66
using ::executorch::extension::llm::make_text_input;
56
67
using ::executorch::extension::llm::MultimodalInput;
68
+ using ::executorch::runtime::EValue;
57
69
58
70
// Returns true when `str` finishes with `suffix`. The comparison is exact
// (case-sensitive); an empty suffix matches every string.
bool ends_with(const std::string& str, const std::string& suffix) {
  // A suffix longer than the string can never match; bail out before the
  // subtraction below could wrap around.
  if (suffix.size() > str.size()) {
    return false;
  }
  const std::size_t tail_start = str.size() - suffix.size();
  return str.compare(tail_start, std::string::npos, suffix) == 0;
}
62
74
75
+ /* *
76
+ * @brief Loads float data from a binary file
77
+ *
78
+ * @param audio_path Path to the binary audio file (.bin)
79
+ * @return Vector of float data loaded from the file
80
+ * @throws std::runtime_error if file loading fails
81
+ */
82
+ std::vector<float > loadBinaryFloatData (const std::string& audio_path) {
83
+ std::ifstream f (audio_path, std::ios::binary | std::ios::ate);
84
+ if (!f.is_open ()) {
85
+ ET_LOG (Error, " Failed to open audio file: %s" , audio_path.c_str ());
86
+ throw std::runtime_error (" Failed to open audio file" );
87
+ }
88
+
89
+ std::size_t n_floats =
90
+ f.tellg () / sizeof (float ); // Number of floats in the audio file
91
+ f.seekg (0 , std::ios::beg);
92
+
93
+ std::vector<float > audio_data (n_floats);
94
+ f.read (
95
+ reinterpret_cast <char *>(audio_data.data ()),
96
+ audio_data.size () * sizeof (float ));
97
+ f.close ();
98
+
99
+ ET_LOG (
100
+ Info, " Loaded .bin file: %s, %zu floats" , audio_path.c_str (), n_floats);
101
+ return audio_data;
102
+ }
103
+
63
104
/* *
64
105
* @brief Loads preprocessed audio data from a binary file
65
106
*
@@ -73,22 +114,19 @@ bool ends_with(const std::string& str, const std::string& suffix) {
73
114
* @return MultimodalInput containing the loaded audio data
74
115
*/
75
116
MultimodalInput loadPreprocessedAudio (const std::string& audio_path) {
76
- std::ifstream f (audio_path, std::ios::binary | std::ios::ate);
117
+ std::vector<float > audio_data = loadBinaryFloatData (audio_path);
118
+
77
119
int32_t n_bins = 128 ;
78
120
int32_t n_frames = 3000 ;
79
- std::size_t n_floats =
80
- f.tellg () / sizeof (float ); // Number of floats in the audio file.
81
- f.seekg (0 , std::ios::beg);
121
+
122
+ std::size_t n_floats = audio_data.size ();
82
123
int32_t batch_size = ceil (
83
124
n_floats /
84
125
(n_bins * n_frames)); // Batch in increments of n_frames, rounding up.
85
- std::vector<float > audio_data (batch_size * n_bins * n_frames);
86
- f.read (
87
- reinterpret_cast <char *>(audio_data.data ()),
88
- audio_data.size () * sizeof (float ));
89
126
90
127
ET_LOG (Info, " audio_data len = %d" , audio_data.size ());
91
128
129
+ // Create Audio multimodal input
92
130
auto audio = std::make_unique<::executorch::extension::llm::Audio>();
93
131
audio->batch_size = batch_size;
94
132
audio->n_bins = n_bins;
@@ -100,29 +138,140 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
100
138
}
101
139
102
140
/* *
103
- * @brief Processes audio files for multimodal input
141
+ * @brief Loads a .bin file into a tensor and processes it using a .pte
142
+ * processor
104
143
*
105
- * Dispatches audio file processing based on file extension:
106
- * - .bin files: Loads preprocessed mel spectrogram features directly
107
- * - .wav/.mp3 files: Currently unsupported, throws runtime_error
144
+ * This function loads raw audio data from a .bin file (similar to
145
+ * loadPreprocessedAudio), creates a tensor from it, and then passes it through
146
+ * a processor module loaded from a .pte file to generate processed audio
147
+ * features.
148
+ *
149
+ * @param audio_path Path to the .bin audio file
150
+ * @param processor_path Path to the .pte processor file
151
+ * @return MultimodalInput containing the processed audio data
152
+ * @throws std::runtime_error if file loading or processing fails
153
+ */
154
+ MultimodalInput processRawAudioFile (
155
+ const std::string& audio_path,
156
+ const std::string& processor_path) {
157
+ if (processor_path.empty ()) {
158
+ ET_LOG (Error, " Processor path is required for raw audio processing" );
159
+ throw std::runtime_error (
160
+ " Processor path is required for raw audio processing" );
161
+ }
162
+
163
+ // Load the audio processor .pte.
164
+ std::unique_ptr<Module> processor_module;
165
+ try {
166
+ processor_module =
167
+ std::make_unique<Module>(processor_path, Module::LoadMode::File);
168
+ auto load_error = processor_module->load ();
169
+ if (load_error != ::executorch::runtime::Error::Ok) {
170
+ ET_LOG (
171
+ Error,
172
+ " Failed to load processor module from: %s" ,
173
+ processor_path.c_str ());
174
+ throw std::runtime_error (" Failed to load processor module" );
175
+ }
176
+ } catch (const std::exception& e) {
177
+ ET_LOG (Error, " Exception while loading processor module: %s" , e.what ());
178
+ throw std::runtime_error (" Exception while loading processor module" );
179
+ }
180
+
181
+ // Load the audio data from file.
182
+ std::vector<float > audio_data = loadBinaryFloatData (audio_path);
183
+
184
+ // Execute the processor
185
+ std::vector<executorch::aten::SizesType> tensor_shape = {
186
+ static_cast <executorch::aten::SizesType>(audio_data.size ())};
187
+ auto input_tensor = from_blob (
188
+ audio_data.data (), tensor_shape, ::executorch::aten::ScalarType::Float);
189
+
190
+ ET_LOG (Info, " Processing audio through processor module..." );
191
+ auto result = processor_module->execute (" forward" , input_tensor);
192
+ if (!result.ok ()) {
193
+ ET_LOG (Error, " Failed to execute processor's forward method" );
194
+ throw std::runtime_error (" Failed to execute processor forward method" );
195
+ }
196
+
197
+ auto outputs = result.get ();
198
+ if (outputs.empty ()) {
199
+ ET_LOG (Error, " Processor returned no outputs" );
200
+ throw std::runtime_error (" Processor returned no outputs" );
201
+ }
202
+
203
+ // Extract processed audio features
204
+ const auto & processed_tensor = outputs[0 ].toTensor ();
205
+ const float * processed_data = processed_tensor.const_data_ptr <float >();
206
+ const auto & sizes = processed_tensor.sizes ();
207
+
208
+ ET_LOG (
209
+ Info,
210
+ " Processed audio tensor shape: [%d, %d, %d]" ,
211
+ static_cast <int >(sizes[0 ]),
212
+ static_cast <int >(sizes[1 ]),
213
+ static_cast <int >(sizes[2 ]));
214
+
215
+ // Create Audio multimodal input from processed features
216
+ auto processed_audio =
217
+ std::make_unique<::executorch::extension::llm::Audio>();
218
+ processed_audio->batch_size =
219
+ static_cast <int32_t >(sizes[0 ]); // Note: batching for s > 30 doesn't work
220
+ // yet, so this will just be = 1.
221
+ processed_audio->n_bins = static_cast <int32_t >(sizes[1 ]);
222
+ processed_audio->n_frames =
223
+ static_cast <int32_t >(sizes[2 ]); // And this will just be = 3000.
224
+
225
+ size_t total_elements = processed_audio->batch_size *
226
+ processed_audio->n_bins * processed_audio->n_frames ;
227
+ processed_audio->data .resize (total_elements * sizeof (float ));
228
+ std::memcpy (
229
+ processed_audio->data .data (),
230
+ processed_data,
231
+ total_elements * sizeof (float ));
232
+
233
+ ET_LOG (
234
+ Info,
235
+ " Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d" ,
236
+ processed_audio->batch_size ,
237
+ processed_audio->n_bins ,
238
+ processed_audio->n_frames );
239
+
240
+ return ::executorch::extension::llm::make_audio_input (
241
+ std::move (*processed_audio));
242
+ }
243
+
244
+ /* *
245
+ * @brief Processes audio files for multimodal input
108
246
*
109
- * This function provides a interface for different audio input formats
110
- * and can be extended to support raw audio processing in the future.
247
+ * Dispatches audio file processing based on file extension and processor
248
+ * availability:
249
+ * - .bin files with processor: Loads raw audio from .bin and processes through
250
+ * processor
251
+ * - .bin files without processor: Loads preprocessed mel spectrogram features
252
+ * directly
111
253
*
112
- * @param audio_path Path to the audio file
254
+ * @param audio_path Path to the audio file (.bin)
255
+ * @param processor_path Path to the processor .pte file (optional)
113
256
* @return MultimodalInput containing the processed audio data
114
257
* @throws std::runtime_error if file format is unsupported or processing fails
115
258
*/
116
- MultimodalInput processAudioFile (const std::string& audio_path) {
259
+ MultimodalInput processAudioFile (
260
+ const std::string& audio_path,
261
+ const std::string& processor_path = " " ) {
117
262
if (ends_with (audio_path, " .bin" )) {
118
- // Current behavior - load preprocessed audio stored as a binary file.
119
- return loadPreprocessedAudio (audio_path);
120
- } else if (ends_with (audio_path, " .wav" ) || ends_with (audio_path, " .mp3" )) {
121
- // New: Process raw audio files - unsupported for now
122
- ET_LOG (Error, " Raw audio file processing (.wav/.mp3) is not yet supported" );
123
- throw std::runtime_error (" Raw audio file processing not supported" );
263
+ if (!processor_path.empty ()) {
264
+ // Process raw audio from .bin file through the processor
265
+ return processRawAudioFile (audio_path, processor_path);
266
+ } else {
267
+ // Load preprocessed audio stored as a binary file (existing behavior)
268
+ return loadPreprocessedAudio (audio_path);
269
+ }
124
270
} else {
125
- ET_LOG (Error, " Unsupported audio file format: %s" , audio_path.c_str ());
271
+ ET_LOG (
272
+ Error,
273
+ " Unsupported audio file format: %s (only .bin files are supported)" ,
274
+ audio_path.c_str ());
126
275
throw std::runtime_error (" Unsupported audio file format" );
127
276
}
128
277
}
@@ -137,6 +286,7 @@ int32_t main(int32_t argc, char** argv) {
137
286
const char * tokenizer_path = FLAGS_tokenizer_path.c_str ();
138
287
const char * prompt = FLAGS_prompt.c_str ();
139
288
const char * audio_path = FLAGS_audio_path.c_str ();
289
+ const char * processor_path = FLAGS_processor_path.c_str ();
140
290
float temperature = FLAGS_temperature;
141
291
int32_t cpu_threads = FLAGS_cpu_threads;
142
292
bool warmup = FLAGS_warmup;
@@ -184,7 +334,7 @@ int32_t main(int32_t argc, char** argv) {
184
334
inputs.emplace_back (make_text_input (" <s>[INST][BEGIN_AUDIO]" ));
185
335
186
336
// 2. Add audio input
187
- inputs.emplace_back (processAudioFile (audio_path));
337
+ inputs.emplace_back (processAudioFile (audio_path, processor_path ));
188
338
189
339
// 3. Add text input (the actual user-submitted prompt)
190
340
inputs.emplace_back (make_text_input (std::string (prompt) + " [/INST]" ));
0 commit comments