Commit 4d5f330
parakeet cuda mode works (pytorch#16674)
When exporting with Dim.AUTO, GPU-enabled PyTorch adds CUDA-specific guards for convolution kernels based on cuDNN workspace calculations, and these guards constrain the dynamic dimension based on the size of the sample tensor. With the original miscalculated 100-frame sample, this created a guard limiting inputs to ~160 mel frames (~1.6 sec of audio), causing runtime failures on longer inputs. On Mac/CPU-only PyTorch builds these cuDNN guards are never added, since no CUDA backend is selected, so NeMo's internal limit of 5000 frames is preserved and inference stays correct. This PR fixes the issue by sizing the sample tensor to the desired maximum audio duration (max_mel_frames), ensuring the cuDNN guard accommodates the full input range. It also adds CI for Parakeet on the CUDA backend.
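For illustration, a minimal sketch of the failure mode described above (TinyEncoder and MAX_MEL_FRAMES are hypothetical stand-ins, not code from this commit): when a dimension is marked Dim.AUTO, torch.export infers its bounds from guards collected while tracing the example input, so the example should be sized to the maximum intended input.

```python
# Illustrative sketch only -- TinyEncoder and MAX_MEL_FRAMES are hypothetical,
# not the actual Parakeet export code.
import torch
from torch.export import Dim, export


class TinyEncoder(torch.nn.Module):
    def __init__(self, feat_in: int = 128):
        super().__init__()
        self.conv = torch.nn.Conv1d(feat_in, 256, kernel_size=3, padding=1)

    def forward(self, mel: torch.Tensor) -> torch.Tensor:
        return self.conv(mel)


MAX_MEL_FRAMES = 5000  # NeMo's internal encoder limit (~50 s of audio)

# Before the fix the example tensor had only 100 frames, and on CUDA builds
# cuDNN-derived guards pinned the dynamic frame dim near that size (~160
# frames). Sizing the example at the intended maximum lets the inferred
# range cover the whole input span.
example = torch.randn(1, 128, MAX_MEL_FRAMES)
ep = export(
    TinyEncoder(),
    (example,),
    dynamic_shapes={"mel": {2: Dim.AUTO}},
    strict=False,
)
print(ep.range_constraints)  # inspect the bounds inferred for the frame dim
```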
1 parent 59aa69d commit 4d5f330

File tree

5 files changed: +157 -34 lines changed

.ci/scripts/export_model_artifact.sh

Lines changed: 31 additions & 1 deletion
@@ -21,6 +21,7 @@ Arguments:
   - mistralai/Voxtral-Mini-3B-2507
   - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
   - google/gemma-3-4b-it
+  - nvidia/parakeet-tdt

 quant_name Quantization type (optional, default: non-quantized)
 Options:
@@ -34,6 +35,7 @@ Examples:
   export_model_artifact.sh metal "openai/whisper-small"
   export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
   export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
+  export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
 EOF
 }

@@ -101,9 +103,21 @@ case "$HF_MODEL" in
     PREPROCESSOR_FEATURE_SIZE=""
     PREPROCESSOR_OUTPUT=""
     ;;
+  nvidia/parakeet-tdt)
+    if [ "$DEVICE" = "metal" ]; then
+      echo "Error: Export for device 'metal' is not yet tested for model '$HF_MODEL'"
+      exit 1
+    fi
+    MODEL_NAME="parakeet"
+    TASK=""
+    MAX_SEQ_LEN=""
+    EXTRA_PIP=""
+    PREPROCESSOR_FEATURE_SIZE=""
+    PREPROCESSOR_OUTPUT=""
+    ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, nvidia/parakeet-tdt"
     exit 1
     ;;
 esac
@@ -141,6 +155,22 @@ if [ -n "$EXTRA_PIP" ]; then
 fi
 pip list

+# Parakeet uses a custom export script
+if [ "$MODEL_NAME" = "parakeet" ]; then
+  pip install -r examples/models/parakeet/install_requirements.txt
+
+  python examples/models/parakeet/export_parakeet_tdt.py \
+    --backend "$DEVICE" \
+    --output-dir "${OUTPUT_DIR}"
+
+  test -f "${OUTPUT_DIR}/model.pte"
+  test -f "${OUTPUT_DIR}/aoti_${DEVICE}_blob.ptd"
+  test -f "${OUTPUT_DIR}/tokenizer.model"
+  ls -al "${OUTPUT_DIR}"
+  echo "::endgroup::"
+  exit 0
+fi
+
 MAX_SEQ_LEN_ARG=""
 if [ -n "$MAX_SEQ_LEN" ]; then
   MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN"

.ci/scripts/test_model_e2e.sh

Lines changed: 41 additions & 14 deletions
@@ -21,6 +21,7 @@ Arguments:
   - mistralai/Voxtral-Mini-3B-2507
   - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
   - google/gemma-3-4b-it
+  - nvidia/parakeet-tdt

 quant_name Quantization type (required)
 Options:
@@ -35,6 +36,7 @@ Arguments:
 Examples:
   test_model_e2e.sh metal "openai/whisper-small" "non-quantized"
   test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
+  test_model_e2e.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./model_output"
 EOF
 }

@@ -118,9 +120,21 @@ case "$HF_MODEL" in
     AUDIO_FILE=""
     IMAGE_PATH="docs/source/_static/img/et-logo.png"
     ;;
+  nvidia/parakeet-tdt)
+    MODEL_NAME="parakeet"
+    RUNNER_TARGET="parakeet_runner"
+    RUNNER_PATH="parakeet"
+    EXPECTED_OUTPUT="Phoebe"
+    PREPROCESSOR=""
+    TOKENIZER_URL=""
+    TOKENIZER_FILE="tokenizer.model"
+    AUDIO_URL="https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav"
+    AUDIO_FILE="test_audio.wav"
+    IMAGE_PATH=""
+    ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, nvidia/parakeet-tdt"
     exit 1
     ;;
 esac
@@ -133,13 +147,15 @@ echo "::endgroup::"
 echo "::group::Prepare $MODEL_NAME Artifacts"


-# Download tokenizer files
-if [ "$TOKENIZER_FILE" != "" ]; then
-  curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE
-else
-  curl -L $TOKENIZER_URL/tokenizer.json -o $MODEL_DIR/tokenizer.json
-  curl -L $TOKENIZER_URL/tokenizer_config.json -o $MODEL_DIR/tokenizer_config.json
-  curl -L $TOKENIZER_URL/special_tokens_map.json -o $MODEL_DIR/special_tokens_map.json
+# Download tokenizer files (skip for parakeet which exports tokenizer with model)
+if [ "$MODEL_NAME" != "parakeet" ]; then
+  if [ "$TOKENIZER_FILE" != "" ]; then
+    curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE
+  else
+    curl -L $TOKENIZER_URL/tokenizer.json -o $MODEL_DIR/tokenizer.json
+    curl -L $TOKENIZER_URL/tokenizer_config.json -o $MODEL_DIR/tokenizer_config.json
+    curl -L $TOKENIZER_URL/special_tokens_map.json -o $MODEL_DIR/special_tokens_map.json
+  fi
 fi

 # Download test files
@@ -187,23 +203,34 @@ case "$MODEL_NAME" in
   gemma3)
     RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --image_path $IMAGE_PATH"
     ;;
+  parakeet)
+    RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --data_path ${MODEL_DIR}/aoti_${DEVICE}_blob.ptd --audio_path ${MODEL_DIR}/$AUDIO_FILE --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE"
+    ;;
 esac

 OUTPUT=$($RUNNER_BIN $RUNNER_ARGS 2>&1)
 EXIT_CODE=$?
 set -e

-if ! echo "$OUTPUT" | grep -iq "$EXPECTED_OUTPUT"; then
-  echo "Expected output '$EXPECTED_OUTPUT' not found in output"
-  exit 1
-else
-  echo "Success: '$EXPECTED_OUTPUT' found in output"
-fi
+echo "Runner output:"
+echo "$OUTPUT"

 if [ $EXIT_CODE -ne 0 ]; then
   echo "Unexpected exit code: $EXIT_CODE"
   exit $EXIT_CODE
 fi
+
+# Validate output for models that have expected output
+if [ -n "$EXPECTED_OUTPUT" ]; then
+  if ! echo "$OUTPUT" | grep -iq "$EXPECTED_OUTPUT"; then
+    echo "Expected output '$EXPECTED_OUTPUT' not found in output"
+    exit 1
+  else
+    echo "Success: '$EXPECTED_OUTPUT' found in output"
+  fi
+else
+  echo "SUCCESS: Runner completed successfully"
+fi
 echo "::endgroup::"

 popd

.github/workflows/cuda.yml

Lines changed: 31 additions & 6 deletions
@@ -138,6 +138,8 @@ jobs:
           name: "whisper-large-v3-turbo"
         - repo: "google"
           name: "gemma-3-4b-it"
+        - repo: "nvidia"
+          name: "parakeet-tdt"
       quant:
         - "non-quantized"
         - "quantized-int4-tile-packed"
@@ -148,6 +150,15 @@ jobs:
            repo: "google"
            name: "gemma-3-4b-it"
          quant: "quantized-int4-weight-only"
+        # Parakeet only supports non-quantized
+        - model:
+            repo: "nvidia"
+            name: "parakeet-tdt"
+          quant: "quantized-int4-tile-packed"
+        - model:
+            repo: "nvidia"
+            name: "parakeet-tdt"
+          quant: "quantized-int4-weight-only"
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN
@@ -165,12 +176,15 @@ jobs:
        ./install_executorch.sh
        echo "::endgroup::"

-        echo "::group::Setup Huggingface"
-        pip install -U "huggingface_hub[cli]<1.0" accelerate
-        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
-        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
-        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
-        echo "::endgroup::"
+        # Setup Huggingface only for models that need it (not parakeet)
+        if [ "${{ matrix.model.name }}" != "parakeet-tdt" ]; then
+          echo "::group::Setup Huggingface"
+          pip install -U "huggingface_hub[cli]<1.0" accelerate
+          huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+          OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+          pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+          echo "::endgroup::"
+        fi

        source .ci/scripts/export_model_artifact.sh cuda "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"

@@ -193,6 +207,8 @@ jobs:
           name: "whisper-large-v3-turbo"
         - repo: "google"
           name: "gemma-3-4b-it"
+        - repo: "nvidia"
+          name: "parakeet-tdt"
       quant:
         - "non-quantized"
         - "quantized-int4-tile-packed"
@@ -203,6 +219,15 @@ jobs:
            repo: "google"
            name: "gemma-3-4b-it"
          quant: "quantized-int4-weight-only"
+        # Parakeet only supports non-quantized
+        - model:
+            repo: "nvidia"
+            name: "parakeet-tdt"
+          quant: "quantized-int4-tile-packed"
+        - model:
+            repo: "nvidia"
+            name: "parakeet-tdt"
+          quant: "quantized-int4-weight-only"
     with:
       timeout: 90
       runner: linux.g5.4xlarge.nvidia.gpu
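The exclude entries above rely on standard GitHub Actions matrix semantics: the full model × quant cross product is generated first, then the listed combinations are dropped, so parakeet-tdt only ever runs non-quantized. A conceptual sketch of that expansion (illustrative values, not Actions internals):

```python
# Conceptual sketch of GitHub Actions matrix expansion; model/quant values
# here are a subset for illustration.
from itertools import product

models = ["gemma-3-4b-it", "parakeet-tdt"]
quants = ["non-quantized", "quantized-int4-tile-packed", "quantized-int4-weight-only"]
excludes = {
    ("gemma-3-4b-it", "quantized-int4-weight-only"),
    ("parakeet-tdt", "quantized-int4-tile-packed"),
    ("parakeet-tdt", "quantized-int4-weight-only"),
}

# Cross product minus excludes = the jobs that actually run.
jobs = [(m, q) for m, q in product(models, quants) if (m, q) not in excludes]
print(jobs)  # parakeet-tdt survives only with "non-quantized"
```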

examples/models/parakeet/README.md

Lines changed: 22 additions & 4 deletions
@@ -38,10 +38,21 @@ python export_parakeet_tdt.py --backend metal --output-dir ./parakeet_metal
 ```

 This generates:
-- `parakeet_tdt.pte` - The compiled model
+- `model.pte` - The compiled Parakeet TDT model
 - `aoti_metal_blob.ptd` - Metal kernel blob required at runtime
 - `tokenizer.model` - SentencePiece tokenizer

+### CUDA Export (Linux)
+
+```bash
+python export_parakeet_tdt.py --backend cuda --output-dir ./parakeet_cuda
+```
+
+This generates:
+- `model.pte` - The compiled Parakeet TDT model
+- `aoti_cuda_blob.ptd` - CUDA kernel blob required at runtime
+- `tokenizer.model` - SentencePiece tokenizer
+
 ## C++ Runner

 ### Building
@@ -55,7 +66,7 @@ make parakeet-cpu
 # Metal build (macOS)
 make parakeet-metal

-# CUDA build (Linux/Windows)
+# CUDA build (Linux)
 make parakeet-cuda
 ```

@@ -66,16 +77,23 @@ From the executorch root directory:
 ```bash
 # CPU/XNNPACK
 ./cmake-out/examples/models/parakeet/parakeet_runner \
-  --model_path examples/models/parakeet/parakeet_tdt_exports/parakeet_tdt.pte \
+  --model_path examples/models/parakeet/parakeet_tdt_exports/model.pte \
   --audio_path /path/to/audio.wav \
   --tokenizer_path examples/models/parakeet/parakeet_tdt_exports/tokenizer.model

 # Metal (include .ptd data file)
 DYLD_LIBRARY_PATH=/usr/lib ./cmake-out/examples/models/parakeet/parakeet_runner \
-  --model_path examples/models/parakeet/parakeet_metal/parakeet_tdt.pte \
+  --model_path examples/models/parakeet/parakeet_metal/model.pte \
   --data_path examples/models/parakeet/parakeet_metal/aoti_metal_blob.ptd \
   --audio_path /path/to/audio.wav \
   --tokenizer_path examples/models/parakeet/parakeet_metal/tokenizer.model
+
+# CUDA (include .ptd data file)
+./cmake-out/examples/models/parakeet/parakeet_runner \
+  --model_path examples/models/parakeet/parakeet_cuda/model.pte \
+  --data_path examples/models/parakeet/parakeet_cuda/aoti_cuda_blob.ptd \
+  --audio_path /path/to/audio.wav \
+  --tokenizer_path examples/models/parakeet/parakeet_cuda/tokenizer.model
 ```

 ### Runner Arguments

examples/models/parakeet/export_parakeet_tdt.py

Lines changed: 32 additions & 9 deletions
@@ -7,7 +7,6 @@
 import tempfile

 import torch
-
 import torchaudio
 from executorch.exir import (
     EdgeCompileConfig,
@@ -297,37 +296,61 @@ def forward(


 def export_all(model):
+    """Export all model components.
+
+    The maximum audio duration is determined by the model's internal
+    max_audio_length (~50 seconds for Parakeet with max_audio_length=5000).
+    """
     programs = {}

+    # Get audio parameters from model config
+    sample_rate = model.preprocessor._cfg.sample_rate
+    window_stride = float(model.preprocessor._cfg.window_stride)
+
+    # Get encoder's actual limit from NeMo model
+    encoder_max_frames = model.encoder.max_audio_length  # typically 5000
+    max_audio_sec = int(encoder_max_frames * window_stride)
+
+    max_audio_samples = int(sample_rate * max_audio_sec)
+    max_mel_frames = int(max_audio_sec / window_stride)
+
     preprocessor_wrapper = PreprocessorWrapper(model.preprocessor)
     preprocessor_wrapper.eval()
-    sample_audio = torch.randn(16000 * 10)
+    sample_audio = torch.randn(max_audio_samples)
     sample_length = torch.tensor([sample_audio.shape[0]], dtype=torch.int64)
-    # The preprocessor definition changes if cuda is available (likely due to making it cuda graphable).
-    # Unfortunately that new definition is not supported by export, so we need to stop that from happening.
+    # The preprocessor uses different code paths when CUDA is available, which include
+    # data-dependent conditionals that torch.export cannot handle. Force CPU path.
     old_cuda_is_available = torch.cuda.is_available
     torch.cuda.is_available = lambda: False
     programs["preprocessor"] = export(
         preprocessor_wrapper,
         (sample_audio, sample_length),
         dynamic_shapes={
-            "audio": {0: Dim("audio_len", min=1600, max=16000 * 600)},
+            # min=1600 samples = 0.1 sec @ 16kHz, max aligned with encoder limit
+            "audio": {0: Dim("audio_len", min=1600, max=max_audio_samples)},
             "length": {},
         },
         strict=False,
     )
     torch.cuda.is_available = old_cuda_is_available

     feat_in = getattr(model.encoder, "_feat_in", 128)
-    audio_signal = torch.randn(1, feat_in, 100)
-    length = torch.tensor([100], dtype=torch.int64)
+    # Use max_mel_frames as example to ensure Dim.AUTO infers the full range.
+    # Smaller examples cause Dim.AUTO to infer narrow bounds.
+    audio_signal = torch.randn(1, feat_in, max_mel_frames)
+    length = torch.tensor([max_mel_frames], dtype=torch.int64)
     encoder_with_proj = EncoderWithProjection(model.encoder, model.joint)
     encoder_with_proj.eval()
+
     programs["encoder"] = export(
         encoder_with_proj,
         (),
         kwargs={"audio_signal": audio_signal, "length": length},
-        dynamic_shapes={"audio_signal": {2: Dim.AUTO}, "length": {}},
+        dynamic_shapes={
+            # Use Dim.AUTO - explicit bounds fail due to different size guards on different devices
+            "audio_signal": {2: Dim.AUTO},
+            "length": {},
+        },
         strict=False,
     )

@@ -553,7 +576,7 @@ def main():

     et = lower_to_executorch(programs, metadata=metadata, backend=args.backend)

-    pte_path = os.path.join(args.output_dir, "parakeet_tdt.pte")
+    pte_path = os.path.join(args.output_dir, "model.pte")
     print(f"\nSaving ExecuTorch program to: {pte_path}")
     with open(pte_path, "wb") as f:
         et.write_to_file(f)
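To make the new size derivation in `export_all` concrete, here is the arithmetic with typical Parakeet TDT values (sample_rate and window_stride come from the NeMo preprocessor config; the exact numbers here are illustrative):

```python
# Worked example of the size derivation in export_all, using typical
# Parakeet TDT values.
sample_rate = 16000        # Hz, from model.preprocessor._cfg.sample_rate
window_stride = 0.01       # seconds per mel frame (10 ms hop)
encoder_max_frames = 5000  # model.encoder.max_audio_length

max_audio_sec = int(encoder_max_frames * window_stride)  # 5000 * 0.01 = 50 s
max_audio_samples = int(sample_rate * max_audio_sec)     # 16000 * 50 = 800,000
max_mel_frames = int(max_audio_sec / window_stride)      # 50 / 0.01 = 5000

print(max_audio_sec, max_audio_samples, max_mel_frames)  # 50 800000 5000
```

With these values the preprocessor's example input is 800,000 samples and the encoder's example input is 5000 mel frames, so the guards inferred during export cover the encoder's full 5000-frame range instead of the ~160-frame bound produced by the old 100-frame sample.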
