99
# Print usage information for this script to stdout.
show_help () {
  cat << EOF
Usage: export_model_artifact.sh <device> <hf_model> [quant_name] [output_dir] [mode]

Export a HuggingFace model to CUDA/Metal/XNNPACK format with optional quantization.

Arguments:
  hf_model    HuggingFace model ID (required)
              Supported models:
                - mistralai/Voxtral-Mini-3B-2507
                - mistralai/Voxtral-Mini-4B-Realtime-2602
                - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
                - google/gemma-3-4b-it
                - nvidia/parakeet-tdt

  output_dir  Output directory for artifacts (optional, default: current directory)

  mode        Export mode (optional, default: auto-detect based on model and device)
              Supported modes:
                - vr-streaming: Voxtral Realtime streaming mode
                - vr-offline: Voxtral Realtime offline mode

Examples:
  export_model_artifact.sh metal "openai/whisper-small"
  export_model_artifact.sh metal "nvidia/parakeet-tdt" "quantized-int4-metal"
  export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "quantized-int4-metal"
  export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." "vr-streaming"
  export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
  export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
  export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
  export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./output"
  export_model_artifact.sh xnnpack "mistralai/Voxtral-Mini-4B-Realtime-2602" "quantized-8da4w" "./output"
  export_model_artifact.sh xnnpack "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "./output" "vr-offline"
EOF
}
4555
@@ -60,6 +70,26 @@ DEVICE="$1"
# Positional arguments ($1 = device, consumed above into DEVICE).
HF_MODEL="$2"
QUANT_NAME="${3:-non-quantized}"
OUTPUT_DIR="${4:-.}"
MODE="${5:-}"

# Validate mode if specified. The vr-* modes select Voxtral Realtime
# streaming/offline export and are meaningless for any other model.
if [ -n "$MODE" ]; then
  case "$MODE" in
    vr-streaming|vr-offline)
      # Voxtral Realtime modes require Voxtral Realtime model
      if [ "$HF_MODEL" != "mistralai/Voxtral-Mini-4B-Realtime-2602" ]; then
        echo "Error: Mode '$MODE' can only be used with Voxtral Realtime model"
        echo "Provided model: $HF_MODEL"
        exit 1
      fi
      ;;
    *)
      echo "Error: Unsupported mode '$MODE'"
      echo "Supported modes: vr-streaming, vr-offline"
      exit 1
      ;;
  esac
fi
6393
6494case " $DEVICE " in
6595 cuda)
@@ -119,9 +149,17 @@ case "$HF_MODEL" in
119149 PREPROCESSOR_FEATURE_SIZE=" "
120150 PREPROCESSOR_OUTPUT=" "
121151 ;;
152+ mistralai/Voxtral-Mini-4B-Realtime-2602)
153+ MODEL_NAME=" voxtral_realtime"
154+ TASK=" "
155+ MAX_SEQ_LEN=" "
156+ EXTRA_PIP=" mistral-common librosa"
157+ PREPROCESSOR_FEATURE_SIZE=" "
158+ PREPROCESSOR_OUTPUT=" "
159+ ;;
122160 * )
123161 echo " Error: Unsupported model '$HF_MODEL '"
124- echo " Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, nvidia/parakeet-tdt"
162+ echo " Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, nvidia/parakeet-tdt"
125163 exit 1
126164 ;;
127165esac
@@ -201,6 +239,64 @@ if [ "$MODEL_NAME" = "parakeet" ]; then
201239 exit 0
202240fi
203241
# Voxtral Realtime uses a custom export script (not the generic export
# path further below), so handle it here and exit early on success.
if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
  pip install safetensors huggingface_hub

  # Download model weights from HuggingFace (requires HF_TOKEN for gated model)
  LOCAL_MODEL_DIR="${OUTPUT_DIR}/model_weights"
  python -c "from huggingface_hub import snapshot_download; snapshot_download('${HF_MODEL}', local_dir='${LOCAL_MODEL_DIR}')"

  # Per-component quantization flags. Held in an array so the multi-word
  # flag lists expand as separate arguments without unquoted word-splitting.
  VR_QUANT_ARGS=()
  if [ "$QUANT_NAME" = "quantized-8da4w" ]; then
    VR_QUANT_ARGS=(--qlinear-encoder 8da4w --qlinear 8da4w --qlinear-group-size 32 --qembedding 8w)
  elif [ "$QUANT_NAME" = "quantized-int4-metal" ]; then
    VR_QUANT_ARGS=(--qlinear-encoder fpa4w --qlinear fpa4w)
  fi

  # Determine streaming mode: explicit MODE wins; otherwise auto-detect
  # (XNNPACK uses streaming, other backends use offline).
  USE_STREAMING="false"
  if [ "$MODE" = "vr-streaming" ]; then
    USE_STREAMING="true"
  elif [ "$MODE" = "vr-offline" ]; then
    USE_STREAMING="false"
  elif [ -z "$MODE" ]; then
    if [ "$DEVICE" = "xnnpack" ]; then
      USE_STREAMING="true"
    fi
  fi

  # Configure export and preprocessor flags based on streaming mode.
  STREAMING_ARGS=()
  PREPROCESSOR_ARGS=(--feature_size 128 --output_file "${OUTPUT_DIR}/preprocessor.pte")
  if [ "$USE_STREAMING" = "true" ]; then
    STREAMING_ARGS=(--streaming)
    PREPROCESSOR_ARGS+=(--streaming)
  else
    PREPROCESSOR_ARGS+=(--stack_output --max_audio_len 300)
  fi

  python -m executorch.examples.models.voxtral_realtime.export_voxtral_rt \
    --model-path "$LOCAL_MODEL_DIR" \
    --backend "$DEVICE" \
    "${STREAMING_ARGS[@]}" \
    --output-dir "${OUTPUT_DIR}" \
    "${VR_QUANT_ARGS[@]}"

  # Export preprocessor
  python -m executorch.extension.audio.mel_spectrogram "${PREPROCESSOR_ARGS[@]}"

  # NOTE(review): these checks presumably rely on set -e earlier in the
  # file to abort on a missing artifact — confirm the script sets it.
  test -f "${OUTPUT_DIR}/model.pte"
  test -f "${OUTPUT_DIR}/preprocessor.pte"
  # Copy tokenizer from downloaded model weights
  cp "$LOCAL_MODEL_DIR/tekken.json" "${OUTPUT_DIR}/tekken.json"
  ls -al "${OUTPUT_DIR}"
  echo "::endgroup::"
  exit 0
fi
299+
204300MAX_SEQ_LEN_ARG=" "
205301if [ -n " $MAX_SEQ_LEN " ]; then
206302 MAX_SEQ_LEN_ARG=" --max_seq_len $MAX_SEQ_LEN "