
Commit 5e06650

[aoti-et] Add an ASR runner and a Whisper example to showcase how to use it (#15486)
**Key Changes:**
* Create a new ASR runner extension in `extension/asr/runner/` with reusable runner components (runner.h/cpp)
* Update CMake configuration files to support ASR runner builds (executorch-config.cmake, default.cmake, llm.cmake)
* Add a new Whisper model example in `examples/models/whisper/` with a CMake build, README, and main.cpp runner
* Bump the optimum-executorch commit pin for Whisper support
* Update the CUDA CI workflow for testing

This change enables automatic speech recognition (ASR) in ExecuTorch, with Whisper as the first supported model, following the pattern of the existing LLM runner infrastructure.
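Taken together, the two new CI scripts in this commit form an end-to-end recipe: export the CUDA artifacts, then build and run the matching runner. A minimal sketch for Whisper, pieced together from the scripts' own usage text (the output directory name here is illustrative):

# 1) Export model.pte, aoti_cuda_blob.ptd, and whisper_preprocessor.pte
bash .ci/scripts/export_model_cuda_artifact.sh "openai/whisper-small" "non-quantized" "./whisper-out"
# 2) Build the whisper_runner and transcribe a 30-second LibriSpeech sample against those artifacts
bash .ci/scripts/test_model_cuda_e2e.sh "openai/whisper-small" "non-quantized" "./whisper-out"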
1 parent be9fc4d commit 5e06650

File tree: 18 files changed, +1331 -361 lines
optimum-executorch commit pin

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-467660923a5a25e4718e1d6697b93ff1bab4e807
+4361747abfc55e40e929396ed986efe775d745f9
.ci/scripts/export_model_cuda_artifact.sh

Lines changed: 150 additions & 0 deletions

@@ -0,0 +1,150 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Export model to CUDA format with optional quantization

show_help() {
  cat << EOF
Usage: export_model_cuda_artifact.sh <hf_model> [quant_name] [output_dir]

Export a HuggingFace model to CUDA format with optional quantization.

Arguments:
  hf_model     HuggingFace model ID (required)
               Supported models:
                 - mistralai/Voxtral-Mini-3B-2507
                 - openai/whisper-small
                 - google/gemma-3-4b-it

  quant_name   Quantization type (optional, default: non-quantized)
               Options:
                 - non-quantized
                 - quantized-int4-tile-packed
                 - quantized-int4-weight-only

  output_dir   Output directory for artifacts (optional, default: current directory)

Examples:
  export_model_cuda_artifact.sh "openai/whisper-small"
  export_model_cuda_artifact.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
  export_model_cuda_artifact.sh "google/gemma-3-4b-it" "non-quantized" "./output"
EOF
}

if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
  show_help
  exit 0
fi

if [ -z "${1:-}" ]; then
  echo "Error: hf_model argument is required"
  echo "Run with -h or --help for usage information"
  exit 1
fi

set -eux

HF_MODEL="$1"
QUANT_NAME="${2:-non-quantized}"
OUTPUT_DIR="${3:-.}"

# Determine model configuration based on HF model ID
case "$HF_MODEL" in
  mistralai/Voxtral-Mini-3B-2507)
    MODEL_NAME="voxtral"
    TASK="multimodal-text-to-text"
    MAX_SEQ_LEN="1024"
    EXTRA_PIP="mistral-common librosa"
    PREPROCESSOR_FEATURE_SIZE="128"
    PREPROCESSOR_OUTPUT="voxtral_preprocessor.pte"
    ;;
  openai/whisper-small)
    MODEL_NAME="whisper"
    TASK="automatic-speech-recognition"
    MAX_SEQ_LEN=""
    EXTRA_PIP="librosa"
    PREPROCESSOR_FEATURE_SIZE="80"
    PREPROCESSOR_OUTPUT="whisper_preprocessor.pte"
    ;;
  google/gemma-3-4b-it)
    MODEL_NAME="gemma3"
    TASK="multimodal-text-to-text"
    MAX_SEQ_LEN="64"
    EXTRA_PIP=""
    PREPROCESSOR_FEATURE_SIZE=""
    PREPROCESSOR_OUTPUT=""
    ;;
  *)
    echo "Error: Unsupported model '$HF_MODEL'"
    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
    exit 1
    ;;
esac

# Determine quantization args based on quant name
case "$QUANT_NAME" in
  non-quantized)
    EXTRA_ARGS=""
    ;;
  quantized-int4-tile-packed)
    EXTRA_ARGS="--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
    ;;
  quantized-int4-weight-only)
    EXTRA_ARGS="--qlinear_encoder 4w"
    ;;
  *)
    echo "Error: Unsupported quantization '$QUANT_NAME'"
    echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only"
    exit 1
    ;;
esac

echo "::group::Export $MODEL_NAME"

if [ -n "$EXTRA_PIP" ]; then
  pip install $EXTRA_PIP
fi
pip list

MAX_SEQ_LEN_ARG=""
if [ -n "$MAX_SEQ_LEN" ]; then
  MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN"
fi
optimum-cli export executorch \
    --model "$HF_MODEL" \
    --task "$TASK" \
    --recipe "cuda" \
    --dtype bfloat16 \
    --device cuda \
    ${MAX_SEQ_LEN_ARG} \
    ${EXTRA_ARGS} \
    --output_dir ./

if [ -n "$PREPROCESSOR_OUTPUT" ]; then
  python -m executorch.extension.audio.mel_spectrogram \
      --feature_size $PREPROCESSOR_FEATURE_SIZE \
      --stack_output \
      --max_audio_len 300 \
      --output_file $PREPROCESSOR_OUTPUT
fi

test -f model.pte
test -f aoti_cuda_blob.ptd
if [ -n "$PREPROCESSOR_OUTPUT" ]; then
  test -f $PREPROCESSOR_OUTPUT
fi
echo "::endgroup::"

echo "::group::Store $MODEL_NAME Artifacts"
mkdir -p "${OUTPUT_DIR}"
cp model.pte "${OUTPUT_DIR}/"
cp aoti_cuda_blob.ptd "${OUTPUT_DIR}/"
if [ -n "$PREPROCESSOR_OUTPUT" ]; then
  cp $PREPROCESSOR_OUTPUT "${OUTPUT_DIR}/"
fi
ls -al "${OUTPUT_DIR}"
echo "::endgroup::"
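For concreteness, with HF_MODEL=openai/whisper-small and QUANT_NAME=non-quantized, the export step above reduces to the following (hand-expanded from the case arms; MAX_SEQ_LEN_ARG and EXTRA_ARGS are empty for this model):

# Expanded export command for the whisper-small, non-quantized path
optimum-cli export executorch \
    --model "openai/whisper-small" \
    --task "automatic-speech-recognition" \
    --recipe "cuda" \
    --dtype bfloat16 \
    --device cuda \
    --output_dir ./
# Whisper also gets an 80-bin mel-spectrogram preprocessor module
python -m executorch.extension.audio.mel_spectrogram \
    --feature_size 80 \
    --stack_output \
    --max_audio_len 300 \
    --output_file whisper_preprocessor.pte

Choosing quantized-int4-tile-packed instead would append --qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d to the optimum-cli call.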

.ci/scripts/test_model_cuda_e2e.sh

Lines changed: 207 additions & 0 deletions
@@ -0,0 +1,207 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Test CUDA model end-to-end, need to run .ci/scripts/export_model_cuda_artifact.sh first

show_help() {
  cat << EOF
Usage: test_model_cuda_e2e.sh <hf_model> <quant_name> [model_dir]

Build and run end-to-end tests for CUDA models.

Arguments:
  hf_model     HuggingFace model ID (required)
               Supported models:
                 - mistralai/Voxtral-Mini-3B-2507
                 - openai/whisper-small
                 - google/gemma-3-4b-it

  quant_name   Quantization type (required)
               Options:
                 - non-quantized
                 - quantized-int4-tile-packed
                 - quantized-int4-weight-only

  model_dir    Directory containing model artifacts (optional, default: current directory)
               Expected files: model.pte, aoti_cuda_blob.ptd
               Tokenizers and test files will be downloaded to this directory

Examples:
  test_model_cuda_e2e.sh "openai/whisper-small" "non-quantized"
  test_model_cuda_e2e.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
EOF
}

if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
  show_help
  exit 0
fi

if [ -z "${1:-}" ]; then
  echo "Error: hf_model argument is required"
  echo "Run with -h or --help for usage information"
  exit 1
fi

if [ -z "${2:-}" ]; then
  echo "Error: quant_name argument is required"
  echo "Run with -h or --help for usage information"
  exit 1
fi

set -eux

HF_MODEL="$1"
QUANT_NAME="$2"
# Download tokenizers, audio, and image files to this directory
MODEL_DIR="${3:-.}"

echo "Testing model: $HF_MODEL (quantization: $QUANT_NAME)"

# Make sure model.pte and aoti_cuda_blob.ptd exist
if [ ! -f "$MODEL_DIR/model.pte" ]; then
  echo "Error: model.pte not found in $MODEL_DIR"
  exit 1
fi
if [ ! -f "$MODEL_DIR/aoti_cuda_blob.ptd" ]; then
  echo "Error: aoti_cuda_blob.ptd not found in $MODEL_DIR"
  exit 1
fi
# Locate EXECUTORCH_ROOT from the directory of this script
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
EXECUTORCH_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

pushd "$EXECUTORCH_ROOT"

# Determine model configuration based on HF model ID
case "$HF_MODEL" in
  mistralai/Voxtral-Mini-3B-2507)
    MODEL_NAME="voxtral"
    RUNNER_TARGET="voxtral_runner"
    RUNNER_PATH="voxtral"
    EXPECTED_OUTPUT="poem"
    PREPROCESSOR="voxtral_preprocessor.pte"
    TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main" # @lint-ignore
    TOKENIZER_FILE="tekken.json"
    AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
    AUDIO_FILE="poem.wav"
    IMAGE_PATH=""
    ;;
  openai/whisper-small)
    MODEL_NAME="whisper"
    RUNNER_TARGET="whisper_runner"
    RUNNER_PATH="whisper"
    EXPECTED_OUTPUT="Mr. Quilter is the apostle of the middle classes"
    PREPROCESSOR="whisper_preprocessor.pte"
    TOKENIZER_URL="https://huggingface.co/openai/whisper-small/resolve/main" # @lint-ignore
    TOKENIZER_FILE=""
    AUDIO_URL=""
    AUDIO_FILE="output.wav"
    IMAGE_PATH=""
    ;;
  google/gemma-3-4b-it)
    MODEL_NAME="gemma3"
    RUNNER_TARGET="gemma3_e2e_runner"
    RUNNER_PATH="gemma3"
    EXPECTED_OUTPUT="chip"
    PREPROCESSOR=""
    TOKENIZER_URL="https://huggingface.co/unsloth/gemma-3-4b-it/resolve/main" # @lint-ignore
    TOKENIZER_FILE=""
    AUDIO_URL=""
    AUDIO_FILE=""
    IMAGE_PATH="docs/source/_static/img/et-logo.png"
    ;;
  *)
    echo "Error: Unsupported model '$HF_MODEL'"
    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
    exit 1
    ;;
esac

echo "::group::Setup ExecuTorch Requirements"
./install_requirements.sh
pip list
echo "::endgroup::"

echo "::group::Prepare $MODEL_NAME Artifacts"

# Download tokenizer files
if [ "$TOKENIZER_FILE" != "" ]; then
  curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE
else
  curl -L $TOKENIZER_URL/tokenizer.json -o $MODEL_DIR/tokenizer.json
  curl -L $TOKENIZER_URL/tokenizer_config.json -o $MODEL_DIR/tokenizer_config.json
  curl -L $TOKENIZER_URL/special_tokens_map.json -o $MODEL_DIR/special_tokens_map.json
fi

# Download test files
if [ "$AUDIO_URL" != "" ]; then
  curl -L $AUDIO_URL -o ${MODEL_DIR}/$AUDIO_FILE
elif [ "$MODEL_NAME" = "whisper" ]; then
  conda install -y -c conda-forge "ffmpeg<8"
  pip install datasets soundfile torchcodec
  python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
fi

ls -al
echo "::endgroup::"

echo "::group::Build $MODEL_NAME Runner"
cmake --preset llm \
      -DEXECUTORCH_BUILD_CUDA=ON \
      -DCMAKE_INSTALL_PREFIX=cmake-out \
      -DCMAKE_BUILD_TYPE=Release \
      -Bcmake-out -S.
cmake --build cmake-out -j$(nproc) --target install --config Release

cmake -DEXECUTORCH_BUILD_CUDA=ON \
      -DCMAKE_BUILD_TYPE=Release \
      -Sexamples/models/$RUNNER_PATH \
      -Bcmake-out/examples/models/$RUNNER_PATH/
cmake --build cmake-out/examples/models/$RUNNER_PATH --target $RUNNER_TARGET --config Release
echo "::endgroup::"

echo "::group::Run $MODEL_NAME Runner"
set +e
export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH

# Build runner command with common arguments
RUNNER_BIN="cmake-out/examples/models/$RUNNER_PATH/$RUNNER_TARGET"
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd --temperature 0"

# Add model-specific arguments
case "$MODEL_NAME" in
  voxtral)
    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
    ;;
  whisper)
    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
    ;;
  gemma3)
    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --image_path $IMAGE_PATH"
    ;;
esac

OUTPUT=$($RUNNER_BIN $RUNNER_ARGS 2>&1)
EXIT_CODE=$?
set -e

if ! echo "$OUTPUT" | grep -iq "$EXPECTED_OUTPUT"; then
  echo "Expected output '$EXPECTED_OUTPUT' not found in output"
  exit 1
else
  echo "Success: '$EXPECTED_OUTPUT' found in output"
fi

if [ $EXIT_CODE -ne 0 ]; then
  echo "Unexpected exit code: $EXIT_CODE"
  exit $EXIT_CODE
fi
echo "::endgroup::"

popd
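For the Whisper path, the command assembled from RUNNER_BIN and RUNNER_ARGS above expands to a single invocation like this (a hand-expanded illustration assuming MODEL_DIR=. and the default artifact names from the export script):

# Expanded whisper_runner invocation for MODEL_DIR=.
cmake-out/examples/models/whisper/whisper_runner \
    --model_path ./model.pte \
    --data_path ./aoti_cuda_blob.ptd \
    --temperature 0 \
    --tokenizer_path ./ \
    --audio_path ./output.wav \
    --processor_path ./whisper_preprocessor.pte

The test then greps the runner output case-insensitively for the expected transcript prefix "Mr. Quilter is the apostle of the middle classes".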
