1212
1313#include < gflags/gflags.h>
1414
15- #include < executorch/extension/module/module.h>
16- #include < executorch/extension/tensor/tensor_ptr_maker.h>
17- #include < executorch/runtime/core/evalue.h>
18-
1915#include < executorch/extension/llm/runner/audio.h>
2016#include < executorch/extension/llm/runner/image.h>
2117#include < executorch/extension/llm/runner/llm_runner_helper.h>
@@ -40,11 +36,6 @@ DEFINE_string(prompt, "What is happening in this audio?", "Text prompt.");
4036
4137DEFINE_string (audio_path, " " , " Path to input audio file." );
4238
43- DEFINE_string (
44- processor_path,
45- " " ,
46- " Path to processor .pte file for raw audio processing." );
47-
4839DEFINE_double (
4940 temperature,
5041 0 .8f ,
@@ -59,48 +50,16 @@ DEFINE_bool(warmup, false, "Whether to run a warmup run.");
5950
6051namespace {
6152
62- using ::executorch::extension::from_blob;
63- using ::executorch::extension::Module;
6453using ::executorch::extension::llm::Image;
6554using ::executorch::extension::llm::make_image_input;
6655using ::executorch::extension::llm::make_text_input;
6756using ::executorch::extension::llm::MultimodalInput;
68- using ::executorch::runtime::EValue;
6957
/**
 * @brief Checks whether a string ends with a given suffix
 *
 * @param str String to inspect
 * @param suffix Suffix to look for at the end of str
 * @return true if str is at least as long as suffix and the last
 * suffix.size() characters of str compare equal to suffix, false otherwise
 */
7058bool ends_with (const std::string& str, const std::string& suffix) {
// The length check runs first so that str.size() - suffix.size() below is
// only evaluated when it cannot wrap around (both sizes are unsigned).
7159 return str.size () >= suffix.size () &&
7260 str.compare (str.size () - suffix.size (), suffix.size (), suffix) == 0 ;
7361}
7462
75- /* *
76- * @brief Loads float data from a binary file
77- *
78- * @param audio_path Path to the binary audio file (.bin)
79- * @return Vector of float data loaded from the file
80- * @throws std::runtime_error if file loading fails
81- */
82- std::vector<float > loadBinaryFloatData (const std::string& audio_path) {
83- std::ifstream f (audio_path, std::ios::binary | std::ios::ate);
84- if (!f.is_open ()) {
85- ET_LOG (Error, " Failed to open audio file: %s" , audio_path.c_str ());
86- throw std::runtime_error (" Failed to open audio file" );
87- }
88-
89- std::size_t n_floats =
90- f.tellg () / sizeof (float ); // Number of floats in the audio file
91- f.seekg (0 , std::ios::beg);
92-
93- std::vector<float > audio_data (n_floats);
94- f.read (
95- reinterpret_cast <char *>(audio_data.data ()),
96- audio_data.size () * sizeof (float ));
97- f.close ();
98-
99- ET_LOG (
100- Info, " Loaded .bin file: %s, %zu floats" , audio_path.c_str (), n_floats);
101- return audio_data;
102- }
103-
10463/* *
10564 * @brief Loads preprocessed audio data from a binary file
10665 *
@@ -114,19 +73,22 @@ std::vector<float> loadBinaryFloatData(const std::string& audio_path) {
11473 * @return MultimodalInput containing the loaded audio data
11574 */
11675MultimodalInput loadPreprocessedAudio (const std::string& audio_path) {
117- std::vector<float > audio_data = loadBinaryFloatData (audio_path);
118-
76+ std::ifstream f (audio_path, std::ios::binary | std::ios::ate);
11977 int32_t n_bins = 128 ;
12078 int32_t n_frames = 3000 ;
121-
122- std::size_t n_floats = audio_data.size ();
79+ std::size_t n_floats =
80+ f.tellg () / sizeof (float ); // Number of floats in the audio file.
81+ f.seekg (0 , std::ios::beg);
12382 int32_t batch_size = ceil (
12483 n_floats /
12584 (n_bins * n_frames)); // Batch in increments of n_frames, rounding up.
85+ std::vector<float > audio_data (batch_size * n_bins * n_frames);
86+ f.read (
87+ reinterpret_cast <char *>(audio_data.data ()),
88+ audio_data.size () * sizeof (float ));
12689
12790 ET_LOG (Info, " audio_data len = %d" , audio_data.size ());
12891
129- // Create Audio multimodal input
13092 auto audio = std::make_unique<::executorch::extension::llm::Audio>();
13193 audio->batch_size = batch_size;
13294 audio->n_bins = n_bins;
@@ -137,141 +99,30 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
13799 return ::executorch::extension::llm::make_audio_input (std::move (*audio));
138100}
139101
140- /* *
141- * @brief Loads a .bin file into a tensor and processes it using a .pte
142- * processor
143- *
144- * This function loads raw audio data from a .bin file (similar to
145- * loadPreprocessedAudio), creates a tensor from it, and then passes it through
146- * a processor module loaded from a .pte file to generate processed audio
147- * features.
148- *
149- * @param audio_path Path to the .bin audio file
150- * @param processor_path Path to the .pte processor file
151- * @return MultimodalInput containing the processed audio data
152- * @throws std::runtime_error if file loading or processing fails
153- */
154- MultimodalInput processRawAudioFile (
155- const std::string& audio_path,
156- const std::string& processor_path) {
157- if (processor_path.empty ()) {
158- ET_LOG (Error, " Processor path is required for raw audio processing" );
159- throw std::runtime_error (
160- " Processor path is required for raw audio processing" );
161- }
162-
163- // Load the audio processor .pte.
164- std::unique_ptr<Module> processor_module;
165- try {
166- processor_module =
167- std::make_unique<Module>(processor_path, Module::LoadMode::File);
168- auto load_error = processor_module->load ();
169- if (load_error != ::executorch::runtime::Error::Ok) {
170- ET_LOG (
171- Error,
172- " Failed to load processor module from: %s" ,
173- processor_path.c_str ());
174- throw std::runtime_error (" Failed to load processor module" );
175- }
176- } catch (const std::exception& e) {
177- ET_LOG (Error, " Exception while loading processor module: %s" , e.what ());
178- throw std::runtime_error (" Exception while loading processor module" );
179- }
180-
181- // Load the audio data from file.
182- std::vector<float > audio_data = loadBinaryFloatData (audio_path);
183-
184- // Execute the processor
185- std::vector<executorch::aten::SizesType> tensor_shape = {
186- static_cast <executorch::aten::SizesType>(audio_data.size ())};
187- auto input_tensor = from_blob (
188- audio_data.data (), tensor_shape, ::executorch::aten::ScalarType::Float);
189-
190- ET_LOG (Info, " Processing audio through processor module..." );
191- auto result = processor_module->execute (" forward" , input_tensor);
192- if (!result.ok ()) {
193- ET_LOG (Error, " Failed to execute processor's forward method" );
194- throw std::runtime_error (" Failed to execute processor forward method" );
195- }
196-
197- auto outputs = result.get ();
198- if (outputs.empty ()) {
199- ET_LOG (Error, " Processor returned no outputs" );
200- throw std::runtime_error (" Processor returned no outputs" );
201- }
202-
203- // Extract processed audio features
204- const auto & processed_tensor = outputs[0 ].toTensor ();
205- const float * processed_data = processed_tensor.const_data_ptr <float >();
206- const auto & sizes = processed_tensor.sizes ();
207-
208- ET_LOG (
209- Info,
210- " Processed audio tensor shape: [%d, %d, %d]" ,
211- static_cast <int >(sizes[0 ]),
212- static_cast <int >(sizes[1 ]),
213- static_cast <int >(sizes[2 ]));
214-
215- // Create Audio multimodal input from processed features
216- auto processed_audio =
217- std::make_unique<::executorch::extension::llm::Audio>();
218- processed_audio->batch_size =
219- static_cast <int32_t >(sizes[0 ]); // Note: batching for s > 30 doesn't work
220- // yet, so this will just be = 1.
221- processed_audio->n_bins = static_cast <int32_t >(sizes[1 ]);
222- processed_audio->n_frames =
223- static_cast <int32_t >(sizes[2 ]); // And this will just be = 3000.
224-
225- size_t total_elements = processed_audio->batch_size *
226- processed_audio->n_bins * processed_audio->n_frames ;
227- processed_audio->data .resize (total_elements * sizeof (float ));
228- std::memcpy (
229- processed_audio->data .data (),
230- processed_data,
231- total_elements * sizeof (float ));
232-
233- ET_LOG (
234- Info,
235- " Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d" ,
236- processed_audio->batch_size ,
237- processed_audio->n_bins ,
238- processed_audio->n_frames );
239-
240- return ::executorch::extension::llm::make_audio_input (
241- std::move (*processed_audio));
242- }
243-
244102/* *
245103 * @brief Processes audio files for multimodal input
246104 *
247- * Dispatches audio file processing based on file extension and processor
248- * availability:
249- * - .bin files with processor: Loads raw audio from .bin and processes through
250- * processor
251- * - .bin files without processor: Loads preprocessed mel spectrogram features
252- * directly
105+ * Dispatches audio file processing based on file extension:
106+ * - .bin files: Loads preprocessed mel spectrogram features directly
107+ * - .wav/.mp3 files: Currently unsupported, throws runtime_error
108+ *
109+ * This function provides an interface for different audio input formats
110+ * and can be extended to support raw audio processing in the future.
253111 *
254- * @param audio_path Path to the audio file (.bin)
255- * @param processor_path Path to the processor .pte file (optional)
112+ * @param audio_path Path to the audio file
256113 * @return MultimodalInput containing the processed audio data
257114 * @throws std::runtime_error if file format is unsupported or processing fails
258115 */
259- MultimodalInput processAudioFile (
260- const std::string& audio_path,
261- const std::string& processor_path = " " ) {
116+ MultimodalInput processAudioFile (const std::string& audio_path) {
262117 if (ends_with (audio_path, " .bin" )) {
263- if (!processor_path.empty ()) {
264- // Process raw audio from .bin file through the processor
265- return processRawAudioFile (audio_path, processor_path);
266- } else {
267- // Load preprocessed audio stored as a binary file (existing behavior)
268- return loadPreprocessedAudio (audio_path);
269- }
118+ // Current behavior - load preprocessed audio stored as a binary file.
119+ return loadPreprocessedAudio (audio_path);
120+ } else if (ends_with (audio_path, " .wav" ) || ends_with (audio_path, " .mp3" )) {
121+ // New: Process raw audio files - unsupported for now
122+ ET_LOG (Error, " Raw audio file processing (.wav/.mp3) is not yet supported" );
123+ throw std::runtime_error (" Raw audio file processing not supported" );
270124 } else {
271- ET_LOG (
272- Error,
273- " Unsupported audio file format: %s (only .bin files are supported)" ,
274- audio_path.c_str ());
125+ ET_LOG (Error, " Unsupported audio file format: %s" , audio_path.c_str ());
275126 throw std::runtime_error (" Unsupported audio file format" );
276127 }
277128}
@@ -286,7 +137,6 @@ int32_t main(int32_t argc, char** argv) {
286137 const char * tokenizer_path = FLAGS_tokenizer_path.c_str ();
287138 const char * prompt = FLAGS_prompt.c_str ();
288139 const char * audio_path = FLAGS_audio_path.c_str ();
289- const char * processor_path = FLAGS_processor_path.c_str ();
290140 float temperature = FLAGS_temperature;
291141 int32_t cpu_threads = FLAGS_cpu_threads;
292142 bool warmup = FLAGS_warmup;
@@ -334,7 +184,7 @@ int32_t main(int32_t argc, char** argv) {
334184 inputs.emplace_back (make_text_input (" <s>[INST][BEGIN_AUDIO]" ));
335185
336186 // 2. Add audio input
337- inputs.emplace_back (processAudioFile (audio_path, processor_path ));
187+ inputs.emplace_back (processAudioFile (audio_path));
338188
339189 // 3. Add text input (the actual user-submitted prompt)
340190 inputs.emplace_back (make_text_input (std::string (prompt) + " [/INST]" ));
0 commit comments