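init: add openai/whisper-large-v2 to the CUDA export/e2e CI and select the Whisper decoder start token by model name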

Gasoonjia · Gasoonjia · commit 0b4d0bc15514 · 2025-11-07T10:14:52.000-08:00
diff --git a/.ci/scripts/export_model_cuda_artifact.sh b/.ci/scripts/export_model_cuda_artifact.sh
@@ -18,6 +18,7 @@ Arguments:
                Supported models:
                  - mistralai/Voxtral-Mini-3B-2507
                  - openai/whisper-small
+                 - openai/whisper-large-v2
                  - google/gemma-3-4b-it
 
   quant_name   Quantization type (optional, default: non-quantized)
@@ -62,7 +63,7 @@ case "$HF_MODEL" in
     PREPROCESSOR_FEATURE_SIZE="128"
     PREPROCESSOR_OUTPUT="voxtral_preprocessor.pte"
     ;;
-  openai/whisper-small)
+  openai/whisper-*)
     MODEL_NAME="whisper"
     TASK="automatic-speech-recognition"
     MAX_SEQ_LEN=""
@@ -80,7 +81,7 @@ case "$HF_MODEL" in
     ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, openai/whisper-large-v2, google/gemma-3-4b-it"
     exit 1
     ;;
 esac
diff --git a/.ci/scripts/test_model_cuda_e2e.sh b/.ci/scripts/test_model_cuda_e2e.sh
@@ -18,6 +18,7 @@ Arguments:
               Supported models:
                 - mistralai/Voxtral-Mini-3B-2507
                 - openai/whisper-small
+                - openai/whisper-large-v2
                 - google/gemma-3-4b-it
 
   quant_name  Quantization type (required)
@@ -91,13 +92,13 @@ case "$HF_MODEL" in
     AUDIO_FILE="poem.wav"
     IMAGE_PATH=""
     ;;
-  openai/whisper-small)
-    MODEL_NAME="whisper"
+  openai/whisper-*)
+    MODEL_NAME="${HF_MODEL#openai/}"
     RUNNER_TARGET="whisper_runner"
     RUNNER_PATH="whisper"
     EXPECTED_OUTPUT="Mr. Quilter is the apostle of the middle classes"
     PREPROCESSOR="whisper_preprocessor.pte"
-    TOKENIZER_URL="https://huggingface.co/openai/whisper-small/resolve/main" # @lint-ignore
+    TOKENIZER_URL="https://huggingface.co/${HF_MODEL}/resolve/main" # @lint-ignore
     TOKENIZER_FILE=""
     AUDIO_URL=""
     AUDIO_FILE="output.wav"
@@ -117,7 +118,7 @@ case "$HF_MODEL" in
     ;;
   *)
     echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
+    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, openai/whisper-large-v2, google/gemma-3-4b-it"
     exit 1
     ;;
 esac
@@ -142,7 +143,7 @@ fi
 # Download test files
 if [ "$AUDIO_URL" != "" ]; then
   curl -L $AUDIO_URL -o ${MODEL_DIR}/$AUDIO_FILE
-elif [ "$MODEL_NAME" = "whisper" ]; then
+elif [[ "$MODEL_NAME" == *whisper* ]]; then
   conda install -y -c conda-forge "ffmpeg<8"
   pip install datasets soundfile torchcodec
   python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
@@ -180,7 +181,9 @@ case "$MODEL_NAME" in
     RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
     ;;
-  whisper)
-    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
+  # MODEL_NAME is "whisper-<variant>" (HF_MODEL minus "openai/"), so match every
+  # variant and pass only the variant suffix (e.g. "large-v2") as --model_name.
+  whisper-*)
+    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR --model_name ${MODEL_NAME#whisper-}"
     ;;
   gemma3)
     RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --image_path $IMAGE_PATH"
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
@@ -104,6 +104,8 @@ jobs:
             name: "Voxtral-Mini-3B-2507"
           - repo: "openai"
             name: "whisper-small"
+          - repo: "openai"
+            name: "whisper-large-v2"
           - repo: "google"
             name: "gemma-3-4b-it"
         quant:
@@ -223,6 +225,8 @@ jobs:
             name: "Voxtral-Mini-3B-2507"
           - repo: "openai"
             name: "whisper-small"
+          - repo: "openai"
+            name: "whisper-large-v2"
           - repo: "google"
             name: "gemma-3-4b-it"
         quant:
diff --git a/examples/models/whisper/main.cpp b/examples/models/whisper/main.cpp
@@ -39,6 +39,10 @@ DEFINE_string(
     audio_path,
     "",
     "Path to input audio file. Accepts .wav or raw float .bin.");
+DEFINE_string(
+    model_name,
+    "base",
+    "Whisper model name (base, small, medium, large, large-v2, large-v3, turbo).");
 DEFINE_double(
     temperature,
     0.0,
@@ -109,7 +113,22 @@ int main(int argc, char** argv) {
   executorch::extension::asr::AsrTranscribeConfig config;
   config.max_new_tokens = FLAGS_max_new_tokens;
   config.temperature = static_cast<float>(FLAGS_temperature);
-  config.decoder_start_token_id = 50257;
+
+  // Set decoder_start_token_id based on model version
+  if (FLAGS_model_name == "large-v2" || FLAGS_model_name == "large-v3" ||
+      FLAGS_model_name == "turbo") {
+    config.decoder_start_token_id = 50258;
+    ET_LOG(
+        Info,
+        "Using decoder_start_token_id=50258 for model: %s",
+        FLAGS_model_name.c_str());
+  } else {
+    config.decoder_start_token_id = 50257;
+    ET_LOG(
+        Info,
+        "Using decoder_start_token_id=50257 for model: %s",
+        FLAGS_model_name.c_str());
+  }
 
   auto result =
       runner.transcribe(features, config, [&](const std::string& piece) {
diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp
@@ -193,7 +193,7 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
           "Conversion complete, first value = %f",
           static_cast<float>(
               preprocessed_features
-                  ->mutable_data_ptr<::executorch::aten::BFloat16>()[0]));
+                  ->mutable_data_ptr<float>()[0]));
     }
   }
 
@@ -225,7 +225,7 @@ Result<std::vector<int64_t>> AsrRunner::transcribe(
       "Encoder first value: %f",
       static_cast<float>(
           encoder_output_tensor
-              .mutable_data_ptr<::executorch::aten::BFloat16>()[0]));
+              .mutable_data_ptr<float>()[0]));
 
   auto encoder_output_ptr = std::make_shared<::executorch::aten::Tensor>(
       std::move(encoder_output_tensor));