pytorch
diff --git a/‎backends/arm/operators/op_view.py‎
Lines changed: 7 additions & 1 deletion b/‎backends/arm/operators/op_view.py‎
Lines changed: 7 additions & 1 deletion
diff --git a/‎backends/arm/test/ops/test_linear.py‎
Lines changed: 68 additions & 3 deletions b/‎backends/arm/test/ops/test_linear.py‎
Lines changed: 68 additions & 3 deletions
diff --git a/‎examples/models/voxtral/multimodal.cpp‎
Lines changed: 174 additions & 24 deletions b/‎examples/models/voxtral/multimodal.cpp‎
Lines changed: 174 additions & 24 deletions
@@ -44,7 +44,13 @@ def define_node(
         validate_valid_dtype(
             self.target,
             [inputs[0], output],
-            [ts.DType.INT8, ts.DType.INT32, ts.DType.FP32, ts.DType.BOOL],
+            [
+                ts.DType.INT8,
+                ts.DType.INT16,
+                ts.DType.INT32,
+                ts.DType.FP32,
+                ts.DType.BOOL,
+            ],
             output.tosa_spec,
         )
 
 
@@ -9,9 +9,12 @@
 from typing import Tuple
 
 import pytest
-
 import torch
-from executorch.backends.arm.test import common
+from executorch.backends.arm.quantizer.arm_quantizer import (
+    get_symmetric_a16w8_quantization_config,
+    TOSAQuantizer,
+)
+from executorch.backends.arm.test import common, conftest
 
 from executorch.backends.arm.test.tester.test_pipeline import (
     EthosU55PipelineINT,
@@ -20,6 +23,8 @@
     TosaPipelineINT,
     VgfPipeline,
 )
+from executorch.backends.arm.tosa_specification import TosaSpecification
+from executorch.backends.xnnpack.test.tester import Quantize
 
 aten_op = "torch.ops.aten.linear.default"
 
@@ -143,7 +148,6 @@ def test_linear_tosa_FP(test_data: torch.Tensor):
     pipeline.run()
 
 
-@pytest.mark.flaky(reruns=5)  # TODO: Investigate flakyness.
 @common.parametrize("test_data", test_data_rank1_INT | test_data_rank4_INT)
 def test_linear_tosa_INT(test_data: torch.Tensor):
     test_data, out_features, has_bias, per_channel_quantization = test_data()
@@ -243,3 +247,64 @@ def test_linear_vgf_INT(test_data: torch.Tensor):
         per_channel_quantization=per_channel_quantization,
     )
     pipeline.run()
+
+
+def get_symmetric_a16w8_linear_quantizer(
+    u55_config=False, per_channel_quantization=False
+):
+    tosa_version = conftest.get_option("tosa_version")
+    tosa_profiles = {
+        "1.0": TosaSpecification.create_from_string("TOSA-1.0+INT+int16"),
+    }
+
+    quantizer = TOSAQuantizer(tosa_profiles[tosa_version])
+    quantizer.set_global(
+        get_symmetric_a16w8_quantization_config(is_per_channel=per_channel_quantization)
+    )
+    quantizer.set_module_type(
+        torch.nn.Linear,
+        get_symmetric_a16w8_quantization_config(
+            is_per_channel=per_channel_quantization
+        ),
+    )
+
+    return Quantize(
+        quantizer,
+        get_symmetric_a16w8_quantization_config(
+            is_per_channel=per_channel_quantization
+        ),
+    )
+
+
+@common.parametrize("test_data", test_data_rank1_INT | test_data_rank4_INT)
+@pytest.mark.xfail(
+    reason="missing int16 linear ops support; fails at TOSA reference model run with Invalid TOSA graph"
+)
+def test_linear_16a8w_tosa_INT(test_data: torch.Tensor):
+    """Test linear operation with 16A8W quantization (16-bit activations, 8-bit weights)"""
+    test_data, out_features, has_bias, per_channel_quantization = test_data()
+    in_features = test_data.shape[-1]
+
+    # Create pipeline with custom 16A8W quantization config
+    pipeline = TosaPipelineINT[input_t1](
+        Linear(
+            in_features=in_features,
+            out_features=out_features,
+            bias=has_bias,
+        ),
+        (test_data,),
+        aten_op,
+        exir_op=[],
+        per_channel_quantization=per_channel_quantization,
+        use_to_edge_transform_and_lower=True,
+        tosa_extensions=["int16"],
+    )
+
+    pipeline.change_args(
+        "quantize",
+        get_symmetric_a16w8_linear_quantizer(
+            per_channel_quantization=per_channel_quantization
+        ),
+    )
+    # Run the pipeline
+    pipeline.run()
@@ -12,6 +12,10 @@
 
 #include <gflags/gflags.h>
 
+#include <executorch/extension/module/module.h>
+#include <executorch/extension/tensor/tensor_ptr_maker.h>
+#include <executorch/runtime/core/evalue.h>
+
 #include <executorch/extension/llm/runner/audio.h>
 #include <executorch/extension/llm/runner/image.h>
 #include <executorch/extension/llm/runner/llm_runner_helper.h>
@@ -36,6 +40,11 @@ DEFINE_string(prompt, "What is happening in this audio?", "Text prompt.");
 
 DEFINE_string(audio_path, "", "Path to input audio file.");
 
+DEFINE_string(
+    processor_path,
+    "",
+    "Path to processor .pte file for raw audio processing.");
+
 DEFINE_double(
     temperature,
     0.8f,
@@ -50,16 +59,48 @@ DEFINE_bool(warmup, false, "Whether to run a warmup run.");
 
 namespace {
 
+using ::executorch::extension::from_blob;
+using ::executorch::extension::Module;
 using ::executorch::extension::llm::Image;
 using ::executorch::extension::llm::make_image_input;
 using ::executorch::extension::llm::make_text_input;
 using ::executorch::extension::llm::MultimodalInput;
+using ::executorch::runtime::EValue;
 
 // Returns true when `str` finishes with `suffix` (exact, case-sensitive
 // character comparison). An empty suffix always matches.
 bool ends_with(const std::string& str, const std::string& suffix) {
   // A suffix longer than the string can never match.
   if (suffix.size() > str.size()) {
     return false;
   }
   const std::size_t tail_start = str.size() - suffix.size();
   return str.compare(tail_start, std::string::npos, suffix) == 0;
 }
 
+/**
+ * @brief Loads float data from a binary file
+ *
+ * The file contents are copied byte-for-byte into float storage, so the file
+ * must hold floats in this machine's in-memory representation. Trailing bytes
+ * that do not make up a whole float are ignored.
+ *
+ * @param audio_path Path to the binary audio file (.bin)
+ * @return Vector of float data loaded from the file (empty for an empty file)
+ * @throws std::runtime_error if the file cannot be opened, its size cannot be
+ * determined, or its contents cannot be read in full
+ */
+std::vector<float> loadBinaryFloatData(const std::string& audio_path) {
+  // Open positioned at the end so tellg() reports the file size.
+  std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
+  if (!f.is_open()) {
+    ET_LOG(Error, "Failed to open audio file: %s", audio_path.c_str());
+    throw std::runtime_error("Failed to open audio file");
+  }
+
+  // tellg() yields -1 on failure; converting that to an element count would
+  // request an enormous allocation, so reject it explicitly.
+  const std::streamoff n_bytes = f.tellg();
+  if (n_bytes < 0) {
+    ET_LOG(
+        Error,
+        "Failed to determine size of audio file: %s",
+        audio_path.c_str());
+    throw std::runtime_error("Failed to determine audio file size");
+  }
+  const std::size_t n_floats =
+      static_cast<std::size_t>(n_bytes) / sizeof(float);
+  f.seekg(0, std::ios::beg);
+
+  std::vector<float> audio_data(n_floats);
+  f.read(
+      reinterpret_cast<char*>(audio_data.data()),
+      static_cast<std::streamsize>(audio_data.size() * sizeof(float)));
+  // A failed seek or short read leaves the stream in a failed state; without
+  // this check the caller would silently receive zero-filled samples.
+  if (!f) {
+    ET_LOG(Error, "Failed to read audio file: %s", audio_path.c_str());
+    throw std::runtime_error("Failed to read audio file");
+  }
+
+  // The stream closes itself when `f` goes out of scope.
+  ET_LOG(
+      Info, "Loaded .bin file: %s, %zu floats", audio_path.c_str(), n_floats);
+  return audio_data;
+}
+
 /**
  * @brief Loads preprocessed audio data from a binary file
  *
@@ -73,22 +114,19 @@ bool ends_with(const std::string& str, const std::string& suffix) {
  * @return MultimodalInput containing the loaded audio data
  */
 MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
-  std::ifstream f(audio_path, std::ios::binary | std::ios::ate);
+  std::vector<float> audio_data = loadBinaryFloatData(audio_path);
+
   int32_t n_bins = 128;
   int32_t n_frames = 3000;
-  std::size_t n_floats =
-      f.tellg() / sizeof(float); // Number of floats in the audio file.
-  f.seekg(0, std::ios::beg);
+
+  std::size_t n_floats = audio_data.size();
   int32_t batch_size = ceil(
       n_floats /
       (n_bins * n_frames)); // Batch in increments of n_frames, rounding up.
-  std::vector<float> audio_data(batch_size * n_bins * n_frames);
-  f.read(
-      reinterpret_cast<char*>(audio_data.data()),
-      audio_data.size() * sizeof(float));
 
   ET_LOG(Info, "audio_data len = %d", audio_data.size());
 
+  // Create Audio multimodal input
   auto audio = std::make_unique<::executorch::extension::llm::Audio>();
   audio->batch_size = batch_size;
   audio->n_bins = n_bins;
@@ -100,29 +138,140 @@ MultimodalInput loadPreprocessedAudio(const std::string& audio_path) {
 }
 
 /**
- * @brief Processes audio files for multimodal input
+ * @brief Loads a .bin file into a tensor and processes it using a .pte
+ * processor
  *
- * Dispatches audio file processing based on file extension:
- * - .bin files: Loads preprocessed mel spectrogram features directly
- * - .wav/.mp3 files: Currently unsupported, throws runtime_error
+ * This function loads raw audio data from a .bin file (similar to
+ * loadPreprocessedAudio), creates a tensor from it, and then passes it through
+ * a processor module loaded from a .pte file to generate processed audio
+ * features.
+ *
+ * @param audio_path Path to the .bin audio file
+ * @param processor_path Path to the .pte processor file
+ * @return MultimodalInput containing the processed audio data
+ * @throws std::runtime_error if file loading or processing fails
+ */
+MultimodalInput processRawAudioFile(
+    const std::string& audio_path,
+    const std::string& processor_path) {
+  if (processor_path.empty()) {
+    ET_LOG(Error, "Processor path is required for raw audio processing");
+    throw std::runtime_error(
+        "Processor path is required for raw audio processing");
+  }
+
+  // Load the audio processor .pte. The load status is inspected after the
+  // try block so its specific error is not caught and re-wrapped by the
+  // generic handler.
+  std::unique_ptr<Module> processor_module;
+  auto load_error = ::executorch::runtime::Error::Ok;
+  try {
+    processor_module =
+        std::make_unique<Module>(processor_path, Module::LoadMode::File);
+    load_error = processor_module->load();
+  } catch (const std::exception& e) {
+    ET_LOG(Error, "Exception while loading processor module: %s", e.what());
+    throw std::runtime_error("Exception while loading processor module");
+  }
+  if (load_error != ::executorch::runtime::Error::Ok) {
+    ET_LOG(
+        Error,
+        "Failed to load processor module from: %s",
+        processor_path.c_str());
+    throw std::runtime_error("Failed to load processor module");
+  }
+
+  // Load the audio data from file.
+  std::vector<float> audio_data = loadBinaryFloatData(audio_path);
+
+  // Execute the processor on a 1-D float tensor over the loaded samples.
+  // NOTE(review): from_blob is expected to wrap audio_data without copying,
+  // so audio_data must stay alive until execute() returns - confirm.
+  std::vector<executorch::aten::SizesType> tensor_shape = {
+      static_cast<executorch::aten::SizesType>(audio_data.size())};
+  auto input_tensor = from_blob(
+      audio_data.data(), tensor_shape, ::executorch::aten::ScalarType::Float);
+
+  ET_LOG(Info, "Processing audio through processor module...");
+  auto result = processor_module->execute("forward", input_tensor);
+  if (!result.ok()) {
+    ET_LOG(Error, "Failed to execute processor's forward method");
+    throw std::runtime_error("Failed to execute processor forward method");
+  }
+
+  auto outputs = result.get();
+  if (outputs.empty()) {
+    ET_LOG(Error, "Processor returned no outputs");
+    throw std::runtime_error("Processor returned no outputs");
+  }
+
+  // Validate the first output before reading its sizes or data: the code
+  // below indexes three dimensions and reinterprets the buffer as float.
+  if (!outputs[0].isTensor()) {
+    ET_LOG(Error, "Processor output is not a tensor");
+    throw std::runtime_error("Processor output is not a tensor");
+  }
+  const auto& processed_tensor = outputs[0].toTensor();
+  if (processed_tensor.dim() != 3) {
+    ET_LOG(
+        Error,
+        "Expected a 3-D processor output, got %d dimension(s)",
+        static_cast<int>(processed_tensor.dim()));
+    throw std::runtime_error("Processor output must be a 3-D tensor");
+  }
+  if (processed_tensor.scalar_type() !=
+      ::executorch::aten::ScalarType::Float) {
+    ET_LOG(Error, "Processor output tensor is not of type Float");
+    throw std::runtime_error("Processor output must be a Float tensor");
+  }
+
+  // Extract processed audio features
+  const float* processed_data = processed_tensor.const_data_ptr<float>();
+  const auto& sizes = processed_tensor.sizes();
+
+  ET_LOG(
+      Info,
+      "Processed audio tensor shape: [%d, %d, %d]",
+      static_cast<int>(sizes[0]),
+      static_cast<int>(sizes[1]),
+      static_cast<int>(sizes[2]));
+
+  // Create Audio multimodal input from processed features
+  auto processed_audio =
+      std::make_unique<::executorch::extension::llm::Audio>();
+  processed_audio->batch_size =
+      static_cast<int32_t>(sizes[0]); // Note: batching for s > 30 doesn't work
+                                      // yet, so this will just be = 1.
+  processed_audio->n_bins = static_cast<int32_t>(sizes[1]);
+  processed_audio->n_frames =
+      static_cast<int32_t>(sizes[2]); // And this will just be = 3000.
+
+  // Take the element count from the tensor rather than multiplying the
+  // int32 fields, which could overflow before widening to size_t.
+  const size_t total_elements = static_cast<size_t>(processed_tensor.numel());
+  processed_audio->data.resize(total_elements * sizeof(float));
+  std::memcpy(
+      processed_audio->data.data(),
+      processed_data,
+      total_elements * sizeof(float));
+
+  ET_LOG(
+      Info,
+      "Created processed Audio: batch_size=%d, n_bins=%d, n_frames=%d",
+      processed_audio->batch_size,
+      processed_audio->n_bins,
+      processed_audio->n_frames);
+
+  return ::executorch::extension::llm::make_audio_input(
+      std::move(*processed_audio));
+}
+
+/**
+ * @brief Processes audio files for multimodal input
  *
- * This function provides a interface for different audio input formats
- * and can be extended to support raw audio processing in the future.
+ * Dispatches audio file processing based on file extension and processor
+ * availability:
+ * - .bin files with processor: Loads raw audio from .bin and processes through
+ * processor
+ * - .bin files without processor: Loads preprocessed mel spectrogram features
+ * directly
+ * - any other file name: logs an error and throws std::runtime_error
+ *
+ * The extension test is the exact, case-sensitive suffix match done by
+ * ends_with(), so e.g. ".BIN" is treated as unsupported.
  *
- * @param audio_path Path to the audio file
+ * @param audio_path Path to the audio file (.bin)
+ * @param processor_path Path to the processor .pte file (optional)
+ * An empty string (the default) means "no processor" and selects the
+ * preprocessed-features path.
  * @return MultimodalInput containing the processed audio data
  * @throws std::runtime_error if file format is unsupported or processing fails
  */
-MultimodalInput processAudioFile(const std::string& audio_path) {
+MultimodalInput processAudioFile(
+    const std::string& audio_path,
+    const std::string& processor_path = "") {
   if (ends_with(audio_path, ".bin")) {
-    // Current behavior - load preprocessed audio stored as a binary file.
-    return loadPreprocessedAudio(audio_path);
-  } else if (ends_with(audio_path, ".wav") || ends_with(audio_path, ".mp3")) {
-    // New: Process raw audio files - unsupported for now
-    ET_LOG(Error, "Raw audio file processing (.wav/.mp3) is not yet supported");
-    throw std::runtime_error("Raw audio file processing not supported");
+    if (!processor_path.empty()) {
+      // Process raw audio from .bin file through the processor
+      return processRawAudioFile(audio_path, processor_path);
+    } else {
+      // Load preprocessed audio stored as a binary file (existing behavior)
+      return loadPreprocessedAudio(audio_path);
+    }
   } else {
-    ET_LOG(Error, "Unsupported audio file format: %s", audio_path.c_str());
+    ET_LOG(
+        Error,
+        "Unsupported audio file format: %s (only .bin files are supported)",
+        audio_path.c_str());
     throw std::runtime_error("Unsupported audio file format");
   }
 }
@@ -137,6 +286,7 @@ int32_t main(int32_t argc, char** argv) {
   const char* tokenizer_path = FLAGS_tokenizer_path.c_str();
   const char* prompt = FLAGS_prompt.c_str();
   const char* audio_path = FLAGS_audio_path.c_str();
+  const char* processor_path = FLAGS_processor_path.c_str();
   float temperature = FLAGS_temperature;
   int32_t cpu_threads = FLAGS_cpu_threads;
   bool warmup = FLAGS_warmup;
@@ -184,7 +334,7 @@ int32_t main(int32_t argc, char** argv) {
   inputs.emplace_back(make_text_input("<s>[INST][BEGIN_AUDIO]"));
 
   // 2. Add audio input
-  inputs.emplace_back(processAudioFile(audio_path));
+  inputs.emplace_back(processAudioFile(audio_path, processor_path));
 
   // 3. Add text input (the actual user-submitted prompt)
   inputs.emplace_back(make_text_input(std::string(prompt) + "[/INST]"));