pytorch · larryliu0820 · Oct 9, 2025 · Oct 9, 2025
@@ -41,8 +41,8 @@ To run the model, we will use the Voxtral runner, which utilizes ExecuTorch's Mu
 The Voxtral runner will do the following things:
 
 - Audio Input:
-  - Option A:  Pass the raw audio tensor into exported preprocessor to produce a mel spectrogram tensor.
-  - Option B:  If starting directly with an already processed audio input tensor, format the inputs to the multimodal runner (metadata tokens, audio tokens, text tokens, etc.).
+   - Option A:  Pass raw audio data from a `.wav` file into the exported preprocessor to produce a mel spectrogram tensor.
+   - Option B:  If starting directly with an already processed audio input tensor (preprocessed mel spectrogram), format the inputs to the multimodal runner (metadata tokens, audio tokens, text tokens, etc.).
 - Feed the formatted inputs to the multimodal runner.
 
 
@@ -66,13 +66,26 @@ cmake -DCMAKE_INSTALL_PREFIX=cmake-out -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=Re
 
 ## Running the model
 You can download the `tekken.json` tokenizer from [Voxtral's HuggingFace repo](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507).
+
+### Running with raw audio (.wav file)
+For raw audio files (`.wav`), you must provide a preprocessor to convert the audio into mel spectrogram format:
+```
+./cmake-out/examples/models/voxtral/voxtral_runner \
+  --model_path path/to/model.pte \
+  --tokenizer_path path/to/tekken.json \
+  --prompt "What can you tell me about this audio?" \
+  --audio_path path/to/audio_input.wav \
+  --processor_path path/to/voxtral_preprocessor.pte
+```
+
+### Running with preprocessed audio (.bin file)
+If you already have a preprocessed mel spectrogram saved as a `.bin` file, you can skip the preprocessor:
 ```
 ./cmake-out/examples/models/voxtral/voxtral_runner \
   --model_path path/to/model.pte \
   --tokenizer_path path/to/tekken.json \
   --prompt "What can you tell me about this audio?" \
-  --audio_path path/to/audio_input.bin \
-  --processor_path path/to/voxtral_preprocessor.pte # If you're passing raw audio file in audio_path
+  --audio_path path/to/preprocessed_audio.bin
 ```
 
 Example output:

@@ -21,6 +21,7 @@
 #include <executorch/extension/llm/runner/llm_runner_helper.h>
 #include <executorch/extension/llm/runner/multimodal_input.h>
 #include <executorch/extension/llm/runner/multimodal_runner.h>
+#include <executorch/extension/llm/runner/wav_loader.h>
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/log.h>
 
@@ -34,6 +35,7 @@ DEFINE_string(
     "multimodal.pte",
     "Model serialized in flatbuffer format.");
 
+DEFINE_string(data_path, "", "Path to data file.");
 DEFINE_string(tokenizer_path, "tekken.json", "Tokenizer stuff.");
 
 DEFINE_string(prompt, "What is happening in this audio?", "Text prompt.");
@@ -113,15 +115,15 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
 }
 
 /**
- * @brief Loads a .bin file into a tensor and processes it using a .pte
- * processor
+ * @brief Loads raw audio from a .bin or .wav file and processes it using a
+ * .pte processor
  *
- * This function loads raw audio data from a .bin file (similar to
- * loadPreprocessedAudio), creates a tensor from it, and then passes it through
- * a processor module loaded from a .pte file to generate processed audio
- * features.
+ * This function loads raw audio data from either a .bin file (raw float array)
+ * or a .wav file (WAV format with headers), creates a tensor from it, and then
+ * passes it through a processor module loaded from a .pte file to generate
+ * processed audio features.
  *
- * @param audio_path Path to the .bin audio file
+ * @param audio_path Path to the .bin or .wav audio file
  * @param processor_path Path to the .pte processor file
  * @return MultimodalInput containing the processed audio data
  * @throws std::runtime_error if file loading or processing fails
@@ -135,6 +137,41 @@ MultimodalInput processRawAudioFile(
         "Processor path is required for raw audio processing");
   }
 
+  // Load the audio data from file (.bin or .wav)
+  std::vector<float> audio_data;
+  if (ends_with(audio_path, ".wav")) {
+    audio_data = ::executorch::extension::llm::load_wav_audio_data(audio_path);
+    ET_LOG(
+        Info,
+        "Loaded WAV file: %s, %zu samples",
+        audio_path.c_str(),
+        audio_data.size());
+  } else if (ends_with(audio_path, ".bin")) {
+    std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
+    if (!f.is_open()) {
+      ET_LOG(Error, "Failed to open audio file: %s", audio_path.c_str());
+      throw std::runtime_error("Failed to open audio file");
+    }
+
+    std::size_t n_floats = f.tellg() / sizeof(float);
+    f.seekg(0, std::ios::beg);
+
+    audio_data.resize(n_floats);
+    f.read(
+        reinterpret_cast<char*>(audio_data.data()),
+        audio_data.size() * sizeof(float));
+    f.close();
+
+    ET_LOG(
+        Info, "Loaded .bin file: %s, %zu floats", audio_path.c_str(), n_floats);
+  } else {
+    ET_LOG(
+        Error,
+        "Unsupported audio file format: %s (only .bin and .wav files are supported)",
+        audio_path.c_str());
+    throw std::runtime_error("Unsupported audio file format");
+  }
+
   // Load the audio processor .pte.
   std::unique_ptr<Module> processor_module;
   try {
@@ -153,25 +190,6 @@ MultimodalInput processRawAudioFile(
     throw std::runtime_error("Exception while loading processor module");
   }
 
-  // Load the audio data from file.
-  std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
-  if (!f.is_open()) {
-    ET_LOG(Error, "Failed to open audio file: %s", audio_path.c_str());
-    throw std::runtime_error("Failed to open audio file");
-  }
-
-  std::size_t n_floats = f.tellg() / sizeof(float);
-  f.seekg(0, std::ios::beg);
-
-  std::vector<float> audio_data(n_floats);
-  f.read(
-      reinterpret_cast<char*>(audio_data.data()),
-      audio_data.size() * sizeof(float));
-  f.close();
-
-  ET_LOG(
-      Info, "Loaded .bin file: %s, %zu floats", audio_path.c_str(), n_floats);
-
   // Execute the processor
   std::vector<executorch::aten::SizesType> tensor_shape = {
       static_cast<executorch::aten::SizesType>(audio_data.size())};
@@ -226,33 +244,39 @@ MultimodalInput processRawAudioFile(
  *
  * Dispatches audio file processing based on file extension and processor
  * availability:
+ * - .wav files: Requires processor, processes raw audio through processor
  * - .bin files with processor: Loads raw audio from .bin and processes through
  * processor
  * - .bin files without processor: Loads preprocessed mel spectrogram features
  * directly
  *
- * @param audio_path Path to the audio file (.bin)
- * @param processor_path Path to the processor .pte file (optional)
+ * @param audio_path Path to the audio file (.bin or .wav)
+ * @param processor_path Path to the processor .pte file (optional for .bin,
+ * required for .wav)
  * @return MultimodalInput containing the processed audio data
  * @throws std::runtime_error if processing fails (an unsupported file format,
  * or a .wav file given without a processor, is reported via ET_CHECK_MSG
  * rather than by throwing)
  */
 MultimodalInput processAudioFile(
     const std::string& audio_path,
     const std::string& processor_path = "") {
-  if (ends_with(audio_path, ".bin")) {
-    if (!processor_path.empty()) {
-      // Process raw audio from .bin file through the processor
-      return processRawAudioFile(audio_path, processor_path);
-    } else {
-      // Load preprocessed audio stored as a binary file (existing behavior)
-      return loadPreprocessedAudio(audio_path);
+  if (ends_with(audio_path, ".wav") || ends_with(audio_path, ".bin")) {
+    if (processor_path.empty()) {
+      if (ends_with(audio_path, ".wav")) {
+        ET_CHECK_MSG(
+            false,
+            "Processor path is required for .wav file processing: %s",
+            audio_path.c_str());
+      } else {
+        // Load preprocessed audio stored as a binary file (existing behavior)
+        return loadPreprocessedAudio(audio_path);
+      }
     }
+    return processRawAudioFile(audio_path, processor_path);
   } else {
-    ET_LOG(
-        Error,
-        "Unsupported audio file format: %s (only .bin files are supported)",
+    ET_CHECK_MSG(
+        false,
+        "Unsupported audio file format: %s (only .bin and .wav files are supported)",
         audio_path.c_str());
-    throw std::runtime_error("Unsupported audio file format");
   }
 }
 
@@ -267,6 +291,7 @@ int32_t main(int32_t argc, char** argv) {
   const char* prompt = FLAGS_prompt.c_str();
   const char* audio_path = FLAGS_audio_path.c_str();
   const char* processor_path = FLAGS_processor_path.c_str();
+  const char* data_path = FLAGS_data_path.c_str();
   float temperature = FLAGS_temperature;
   int32_t cpu_threads = FLAGS_cpu_threads;
   bool warmup = FLAGS_warmup;
@@ -294,7 +319,7 @@ int32_t main(int32_t argc, char** argv) {
   // Create multimodal runner
   std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner =
       ::executorch::extension::llm::create_multimodal_runner(
-          model_path, std::move(tokenizer));
+          model_path, std::move(tokenizer), data_path);
   if (runner == nullptr) {
     ET_LOG(Error, "Failed to create multimodal runner");
     return 1;

@@ -105,6 +105,7 @@ def define_common_targets():
             exported_headers = [
                 "audio.h",
                 "image.h",
+                "wav_loader.h",
                 "multimodal_input.h",
                 "multimodal_runner.h",
                 "multimodal_prefiller.h",

@@ -19,7 +19,7 @@ include(${EXECUTORCH_ROOT}/tools/cmake/Test.cmake)
 
 set(_test_srcs
     test_generation_config.cpp test_text_llm_runner.cpp test_text_prefiller.cpp
-    test_text_decoder_runner.cpp test_multimodal_input.cpp
+    test_text_decoder_runner.cpp test_multimodal_input.cpp test_wav_loader.cpp
 )
 
 # Add LSan stub for Apple platforms

@@ -44,3 +44,13 @@ def define_common_targets():
             "//executorch/extension/llm/runner:multimodal_runner_lib",
         ],
     )
+
+    runtime.cxx_test(
+        name = "test_wav_loader",
+        srcs = ["test_wav_loader.cpp"],
+        deps = [
+            "//executorch/extension/testing_util:temp_file",
+            "//executorch/extension/llm/runner:multimodal_runner_lib",
+            "//executorch/runtime/platform:platform",
+        ],
+    )