Include audio preprocessing for raw audio tensor (#13855)

pytorchbot · web-flow · commit 624463e983b6 · 2025-09-03T16:59:47.000-07:00
## Summary Runs audio preprocessing (mel spectrogram conversion) on a raw audio tensor stored in a .bin file, using a `.pte` exported from https://github.com/pytorch/executorch/blob/main/extension/audio/mel_spectrogram.py. Current limitations: the spectrogram processing module does not batch its output, so only audio shorter than 30 seconds is supported. Example run output: ``` The speaker in this audio seems to be talking about their concerns about a device called the model or maybe they're just talking about the model in general. They mention that the model was trained with the speaker for inference, which suggests that the model was trained based on the speaker's data or instructions. They also mention that the volume is quite small, which could imply that the speaker is trying to control the volume of the model's output, likely because they are concerned about how loud the model's responses might PyTorchObserver {"prompt_tokens":388,"generated_tokens":99,"model_load_start_ms":0,"model_load_end_ms":0,"inference_start_ms":1756351346381,"inference_end_ms":1756351362602,"prompt_eval_end_ms":1756351351435,"first_token_ms":1756351351435,"aggregate_sampling_time_ms":99,"SCALING_FACTOR_UNITS_PER_SECOND":1000} I 00:00:24.036773 executorch:stats.h:104] Prompt Tokens: 388 Generated Tokens: 99 I 00:00:24.036800 executorch:stats.h:110] Model Load Time: 0.000000 (seconds) I 00:00:24.036805 executorch:stats.h:117] Total inference time: 16.221000 (seconds) Rate: 6.103200 (tokens/second) I 00:00:24.036815 executorch:stats.h:127] Prompt evaluation: 5.054000 (seconds) Rate: 76.770875 (tokens/second) I 00:00:24.036819 executorch:stats.h:136] Generated 99 tokens: 11.167000 (seconds) Rate: 8.865407 (tokens/second) I 00:00:24.036822 executorch:stats.h:147] Time to first generated token: 5.054000 (seconds) I 00:00:24.036828 executorch:stats.h:153] Sampling time over 487 tokens: 0.099000 (seconds) ```
diff --git a/examples/models/voxtral/multimodal.cpp b/examples/models/voxtral/multimodal.cpp
@@ -12,6 +12,10 @@
 
 #include <gflags/gflags.h>
 
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor_ptr_maker.h>
+#include <executorch/runtime/core/evalue.h>
+
 #include <executorch/extension/llm/runner/audio.h>
 #include <executorch/extension/llm/runner/image.h>
 #include <executorch/extension/llm/runner/llm_runner_helper.h>
@@ -36,6 +40,11 @@ DEFINE_string(prompt, "What is happening in this audio?", "Text prompt.");
 
 DEFINE_string(audio_path, "", "Path to input audio file.");
 
+DEFINE_string(
+    processor_path,
+    "",
+    "Path to processor .pte file for raw audio processing.");
+
 DEFINE_double(
     temperature,
     0.8f,
@@ -50,16 +59,48 @@ DEFINE_bool(warmup, false, "Whether to run a warmup run.");
 
 namespace {
 
+using ::executorch::extension::from_blob;
+using ::executorch::extension::Module;
 using ::executorch::extension::llm::Image;
 using ::executorch::extension::llm::make_image_input;
 using ::executorch::extension::llm::make_text_input;
 using ::executorch::extension::llm::MultimodalInput;
+using ::executorch::runtime::EValue;
 
 bool ends_with(const std::string& str, const std::string& suffix) {
   return str.size() >= suffix.size() &&
       str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
 }
 
+/**
+ * @brief Loads float data from a binary file
+ *
+ * @param audio_path Path to the binary audio file (.bin)
+ * @return Vector of float data loaded from the file (whole floats only)
+ * @throws std::runtime_error if the file cannot be opened, sized, or read
+ */
+std::vector<float> loadBinaryFloatData(const std::string& audio_path) {
+  std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
+  if (!f.is_open()) {
+    ET_LOG(Error, "Failed to open audio file: %s", audio_path.c_str());
+    throw std::runtime_error("Failed to open audio file");
+  }
+  const std::streamoff n_bytes = f.tellg(); // Opened at end; -1 on failure.
+  const std::size_t n_floats =
+      n_bytes < 0 ? 0 : static_cast<std::size_t>(n_bytes) / sizeof(float);
+  f.seekg(0, std::ios::beg);
+
+  std::vector<float> audio_data(n_floats);
+  f.read(reinterpret_cast<char*>(audio_data.data()), n_floats * sizeof(float));
+  if (n_bytes < 0 || !f) { // Size unknown, or fewer bytes read than expected.
+    ET_LOG(Error, "Failed to read audio file: %s", audio_path.c_str());
+    throw std::runtime_error("Failed to read audio file");
+  }
+  ET_LOG(
+      Info, "Loaded .bin file: %s, %zu floats", audio_path.c_str(), n_floats);
+  return audio_data;
+}
+
 /**
  * @brief Loads preprocessed audio data from a binary file
  *
@@ -73,22 +114,19 @@ bool ends_with(const std::string& str, const std::string& suffix) {
  * @return MultimodalInput containing the loaded audio data
  */
 MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
-  std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
+  std::vector<float> audio_data = loadBinaryFloatData(audio_path);
+
   int32_t n_bins = 128;
   int32_t n_frames = 3000;
-  std::size_t n_floats =
-      f.tellg() / sizeof(float); // Number of floats in the audio file.
-  f.seekg(0, std::ios::beg);
+
+  std::size_t n_floats = audio_data.size();
   int32_t batch_size = ceil(
       n_floats /
       (n_bins * n_frames)); // Batch in increments of n_frames, rounding up.
-  std::vector<float> audio_data(batch_size * n_bins * n_frames);
-  f.read(
-      reinterpret_cast<char*>(audio_data.data()),
-      audio_data.size() * sizeof(float));
 
   ET_LOG(Info, "audio_data len = %d", audio_data.size());
 
+  // Create Audio multimodal input
   auto audio = std::make_unique<::executorch::extension::llm::Audio>();
   audio->batch_size = batch_size;
   audio->n_bins = n_bins;
@@ -100,29 +138,140 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
 }
 
 /**
- * @brief Processes audio files for multimodal input
+ * @brief Loads a .bin file into a tensor and processes it using a .pte
+ * processor
  *
- * Dispatches audio file processing based on file extension:
- * - .bin files: Loads preprocessed mel spectrogram features directly
- * - .wav/.mp3 files: Currently unsupported, throws runtime_error
+ * This function loads raw audio data from a .bin file (similar to
+ * loadPreprocessedAudio), creates a tensor from it, and then passes it through
+ * a processor module loaded from a .pte file to generate processed audio
+ * features.
+ *
+ * @param audio_path Path to the .bin audio file
+ * @param processor_path Path to the .pte processor file
+ * @return MultimodalInput containing the processed audio data
+ * @throws std::runtime_error if file loading or processing fails
+ */
+MultimodalInput processRawAudioFile(
+    const std::string& audio_path,
+    const std::string& processor_path) {
+  if (processor_path.empty()) {
+    ET_LOG(Error, "Processor path is required for raw audio processing");
+    throw std::runtime_error(
+        "Processor path is required for raw audio processing");
+  }
+
+  // Load the audio processor .pte.
+  std::unique_ptr<Module> processor_module;
+  try {
+    processor_module =
+        std::make_unique<Module>(processor_path, Module::LoadMode::File);
+    auto load_error = processor_module->load();
+    if (load_error != ::executorch::runtime::Error::Ok) {
+      ET_LOG(
+          Error,
+          "Failed to load processor module from: %s",
+          processor_path.c_str());
+      throw std::runtime_error("Failed to load processor module");
+    }
+  } catch (const std::exception& e) {
+    ET_LOG(Error, "Exception while loading processor module: %s", e.what());
+    throw std::runtime_error("Exception while loading processor module");
+  }
+
+  // Load the audio data from file.
+  std::vector<float> audio_data = loadBinaryFloatData(audio_path);
+
+  // Execute the processor on a 1-D float tensor holding every sample.
+  std::vector<executorch::aten::SizesType> tensor_shape = {
+      static_cast<executorch::aten::SizesType>(audio_data.size())};
+  auto input_tensor = from_blob(
+      audio_data.data(), tensor_shape, ::executorch::aten::ScalarType::Float);
+
+  ET_LOG(Info, "Processing audio through processor module...");
+  auto result = processor_module->execute("forward", input_tensor);
+  if (!result.ok()) {
+    ET_LOG(Error, "Failed to execute processor's forward method");
+    throw std::runtime_error("Failed to execute processor forward method");
+  }
+
+  auto outputs = result.get();
+  if (outputs.empty()) {
+    ET_LOG(Error, "Processor returned no outputs");
+    throw std::runtime_error("Processor returned no outputs");
+  }
+
+  // Extract processed audio features. The first output must be a rank-3 float
+  // tensor: sizes[0..2] are indexed and the data is copied as floats below.
+  if (!outputs[0].isTensor() || outputs[0].toTensor().dim() != 3 ||
+      outputs[0].toTensor().scalar_type() !=
+          ::executorch::aten::ScalarType::Float) {
+    ET_LOG(Error, "Processor output is not a 3-D float tensor");
+    throw std::runtime_error("Processor output is not a 3-D float tensor");
+  }
+  const auto& processed_tensor = outputs[0].toTensor();
+  const float* processed_data = processed_tensor.const_data_ptr<float>();
+  const auto& sizes = processed_tensor.sizes();
+
+  ET_LOG(
+      Info,
+      "Processed audio tensor shape: [%d, %d, %d]",
+      static_cast<int>(sizes[0]),
+      static_cast<int>(sizes[1]),
+      static_cast<int>(sizes[2]));
+
+  // Create Audio multimodal input from processed features. Batching for audio
+  // over 30 s doesn't work yet, so batch_size will be 1 and n_frames 3000.
+  ::executorch::extension::llm::Audio audio{};
+  audio.batch_size = static_cast<int32_t>(sizes[0]);
+  audio.n_bins = static_cast<int32_t>(sizes[1]);
+  audio.n_frames = static_cast<int32_t>(sizes[2]);
+
+  // Use the tensor's own byte count; an int32 dim product could overflow.
+  const size_t n_bytes = processed_tensor.nbytes();
+  audio.data.resize(n_bytes);
+  std::memcpy(audio.data.data(), processed_data, n_bytes);
+
+  ET_LOG(
+      Info,
+      "Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d",
+      audio.batch_size,
+      audio.n_bins,
+      audio.n_frames);
+
+  return ::executorch::extension::llm::make_audio_input(std::move(audio));
+}
+
+/**
+ * @brief Processes audio files for multimodal input
  *
- * This function provides a interface for different audio input formats
- * and can be extended to support raw audio processing in the future.
+ * Dispatches audio file processing based on file extension and processor
+ * availability:
+ * - .bin files with processor: Loads raw audio from .bin and processes through
+ * processor
+ * - .bin files without processor: Loads preprocessed mel spectrogram features
+ * directly
  *
- * @param audio_path Path to the audio file
+ * @param audio_path Path to the audio file (.bin)
+ * @param processor_path Path to the processor .pte file (optional)
  * @return MultimodalInput containing the processed audio data
  * @throws std::runtime_error if file format is unsupported or processing fails
  */
-MultimodalInput processAudioFile(const std::string& audio_path) {
+MultimodalInput processAudioFile(
+    const std::string& audio_path,
+    const std::string& processor_path = "") {
   if (ends_with(audio_path, ".bin")) {
-    // Current behavior - load preprocessed audio stored as a binary file.
-    return loadPreprocessedAudio(audio_path);
-  } else if (ends_with(audio_path, ".wav") || ends_with(audio_path, ".mp3")) {
-    // New: Process raw audio files - unsupported for now
-    ET_LOG(Error, "Raw audio file processing (.wav/.mp3) is not yet supported");
-    throw std::runtime_error("Raw audio file processing not supported");
+    if (processor_path.empty()) {
+      // No processor given: the .bin already holds preprocessed features.
+      return loadPreprocessedAudio(audio_path);
+    }
+    // A processor was given: treat the .bin as raw audio and run it through
+    // the processor.
+    return processRawAudioFile(audio_path, processor_path);
   } else {
-    ET_LOG(Error, "Unsupported audio file format: %s", audio_path.c_str());
+    ET_LOG(
+        Error,
+        "Unsupported audio file format: %s (only .bin files are supported)",
+        audio_path.c_str());
     throw std::runtime_error("Unsupported audio file format");
   }
 }
@@ -137,6 +286,7 @@ int32_t main(int32_t argc, char** argv) {
   const char* tokenizer_path = FLAGS_tokenizer_path.c_str();
   const char* prompt = FLAGS_prompt.c_str();
   const char* audio_path = FLAGS_audio_path.c_str();
+  const char* processor_path = FLAGS_processor_path.c_str();
   float temperature = FLAGS_temperature;
   int32_t cpu_threads = FLAGS_cpu_threads;
   bool warmup = FLAGS_warmup;
@@ -184,7 +334,7 @@ int32_t main(int32_t argc, char** argv) {
   inputs.emplace_back(make_text_input("<s>[INST][BEGIN_AUDIO]"));
 
   // 2. Add audio input
-  inputs.emplace_back(processAudioFile(audio_path));
+  inputs.emplace_back(processAudioFile(audio_path, processor_path));
 
   // 3. Add text input (the actual user-submitted prompt)
   inputs.emplace_back(make_text_input(std::string(prompt) + "[/INST]"));