Add option to stack mel spectrogram output along the batch dimension

jackzhxng · facebook-github-bot · commit 5d0ce347d8c8 · 2025-09-12T12:19:30.000-07:00
Differential Revision: D81798729
diff --git a/extension/audio/mel_spectrogram.py b/extension/audio/mel_spectrogram.py
@@ -8,19 +8,17 @@
 import logging
 
 import torch
+from torch.export import Dim
 import torch.nn as nn
 import torch.nn.functional as F
 
 from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
-
 from executorch.exir import (
     EdgeCompileConfig,
     EdgeProgramManager,
     to_edge_transform_and_lower,
 )
 
-from torch.export import Dim, export, ExportedProgram
-
 
 class WhisperAudioProcessor(nn.Module):
     r"""
@@ -51,6 +49,8 @@ def __init__(
         chunk_length: int = 30,
         n_fft: int = 400,
         padding_value: float = 0.0,
+        max_audio_len: int = 600,
+        stack_output: bool = False,
     ) -> None:
         super().__init__()
         self.feature_size = feature_size
@@ -66,6 +66,9 @@ def __init__(
         self.mel_filters = self.get_mel_filters(
             sampling_rate, n_fft, n_mels=feature_size
         )
+        self.max_audio_len = max_audio_len
+        self.max_n_chunks = int(max_audio_len / chunk_length)
+        self.stack_output = stack_output
 
     def get_mel_filters(
         self, sr: int, n_fft: int, n_mels: int = 128, dtype: torch.dtype = torch.float32
@@ -131,12 +134,14 @@ def forward(self, waveform: torch.Tensor) -> torch.Tensor:
             [1, 80, 3000] with default options and 1 chunk
         """
         n_chunks = (waveform.shape[0] - 1) // self.n_samples + 1
+        torch._constrain_as_size(n_chunks, max=self.max_n_chunks)  # Explicitly sets the max bound, otherwise export complains about it being infinite.
         waveform = F.pad(
             waveform,
             (0, self.n_samples * n_chunks - waveform.shape[0]),
             mode="constant",
             value=self.padding_value,
         )
+
         # Ideally we should do:
         # window = torch.hann_window(self.n_fft)
         # but this is not currently supported when lowering.
@@ -166,18 +171,24 @@ def forward(self, waveform: torch.Tensor) -> torch.Tensor:
         log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
         log_spec = (log_spec + 4.0) / 4.0
 
-        return log_spec.unsqueeze(0)
+        if self.stack_output:
+            log_spec = log_spec.reshape(self.feature_size, -1, self.nb_max_frames)
+            log_spec = log_spec.transpose(0, 1)
+            return log_spec
+        else:
+            return log_spec.unsqueeze(0)
 
 
 def export_processor(model=None, output_file="whisper_preprocess.pte"):
     if model is None:
         model = WhisperAudioProcessor()
-    audio_tensor = torch.randn(480000)
-    chunk_tensor = audio_tensor[:93680]
-    with torch.no_grad():
-        dim = Dim("waveform", min=1600, max=audio_tensor.size(0) * 10)  # 10 chunks max
-        ep: ExportedProgram = export(
-            model, (chunk_tensor,), dynamic_shapes={"waveform": {0: dim}}, strict=True
+
+    audio_tensor = torch.randn(93680)
+    shapes_collection = torch.export.ShapesCollection()
+    shapes_collection[audio_tensor] = {0: Dim.DYNAMIC}
+    with torch.no_grad(), torch.fx.experimental._config.patch(backed_size_oblivious=True):
+        ep = torch.export.export(
+            model, (audio_tensor,), dynamic_shapes=shapes_collection, strict=True
         )
         logging.debug(ep)
 
@@ -236,6 +247,17 @@ def main():
         default="whisper_preprocess.pte",
         help="Output file path for the exported model",
     )
+    parser.add_argument(
+        "--max_audio_len",
+        type=int,
+        default=600,
+        help="Max audio length that can be processed, in seconds."
+    )
+    parser.add_argument(
+        "--stack_output",
+        action="store_true",
+        help="Whether to stack output along the batch dimension, one per chunk. Used by models such as Voxtral, see https://github.com/huggingface/transformers/blob/main/src/transformers/models/voxtral/processing_voxtral.py#L94 for more information."
+    )
 
     args = parser.parse_args()
 
@@ -245,6 +267,8 @@ def main():
         hop_length=args.hop_length,
         chunk_length=args.chunk_length,
         n_fft=args.n_fft,
+        max_audio_len=args.max_audio_len,
+        stack_output=args.stack_output,
     )
 
     export_processor(model, args.output_file)