diff --git a/.ci/docker/ci_commit_pins/optimum-executorch.txt b/.ci/docker/ci_commit_pins/optimum-executorch.txt
index e0af9344588..574ccb745d0 100644
--- a/.ci/docker/ci_commit_pins/optimum-executorch.txt
+++ b/.ci/docker/ci_commit_pins/optimum-executorch.txt
@@ -1 +1 @@
-4361747abfc55e40e929396ed986efe775d745f9
+467660923a5a25e4718e1d6697b93ff1bab4e807
diff --git a/.ci/scripts/export_model_cuda_artifact.sh b/.ci/scripts/export_model_cuda_artifact.sh
deleted file mode 100755
index 85e34ae5b80..00000000000
--- a/.ci/scripts/export_model_cuda_artifact.sh
+++ /dev/null
@@ -1,150 +0,0 @@
-#!/bin/bash
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# Export model to CUDA format with optional quantization
-
-show_help() {
-  cat << EOF
-Usage: export_model_cuda_artifact.sh <hf_model> [quant_name] [output_dir]
-
-Export a HuggingFace model to CUDA format with optional quantization.
-
-Arguments:
-  hf_model     HuggingFace model ID (required)
-               Supported models:
-                 - mistralai/Voxtral-Mini-3B-2507
-                 - openai/whisper-small
-                 - google/gemma-3-4b-it
-
-  quant_name   Quantization type (optional, default: non-quantized)
-               Options:
-                 - non-quantized
-                 - quantized-int4-tile-packed
-                 - quantized-int4-weight-only
-
-  output_dir   Output directory for artifacts (optional, default: current directory)
-
-Examples:
-  export_model_cuda_artifact.sh "openai/whisper-small"
-  export_model_cuda_artifact.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
-  export_model_cuda_artifact.sh "google/gemma-3-4b-it" "non-quantized" "./output"
-EOF
-}
-
-if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
-  show_help
-  exit 0
-fi
-
-if [ -z "${1:-}" ]; then
-  echo "Error: hf_model argument is required"
-  echo "Run with -h or --help for usage information"
-  exit 1
-fi
-
-set -eux
-
-HF_MODEL="$1"
-QUANT_NAME="${2:-non-quantized}"
-OUTPUT_DIR="${3:-.}"
-
-# Determine model configuration based on HF model ID
-case "$HF_MODEL" in
-  mistralai/Voxtral-Mini-3B-2507)
-    MODEL_NAME="voxtral"
-    TASK="multimodal-text-to-text"
-    MAX_SEQ_LEN="1024"
-    EXTRA_PIP="mistral-common librosa"
-    PREPROCESSOR_FEATURE_SIZE="128"
-    PREPROCESSOR_OUTPUT="voxtral_preprocessor.pte"
-    ;;
-  openai/whisper-small)
-    MODEL_NAME="whisper"
-    TASK="automatic-speech-recognition"
-    MAX_SEQ_LEN=""
-    EXTRA_PIP="librosa"
-    PREPROCESSOR_FEATURE_SIZE="80"
-    PREPROCESSOR_OUTPUT="whisper_preprocessor.pte"
-    ;;
-  google/gemma-3-4b-it)
-    MODEL_NAME="gemma3"
-    TASK="multimodal-text-to-text"
-    MAX_SEQ_LEN="64"
-    EXTRA_PIP=""
-    PREPROCESSOR_FEATURE_SIZE=""
-    PREPROCESSOR_OUTPUT=""
-    ;;
-  *)
-    echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
-    exit 1
-    ;;
-esac
-
-# Determine quantization args based on quant name
-case "$QUANT_NAME" in
-  non-quantized)
-    EXTRA_ARGS=""
-    ;;
-  quantized-int4-tile-packed)
-    EXTRA_ARGS="--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
-    ;;
-  quantized-int4-weight-only)
-    EXTRA_ARGS="--qlinear_encoder 4w"
-    ;;
-  *)
-    echo "Error: Unsupported quantization '$QUANT_NAME'"
-    echo "Supported quantizations: non-quantized, quantized-int4-tile-packed, quantized-int4-weight-only"
-    exit 1
-    ;;
-esac
-
-echo "::group::Export $MODEL_NAME"
-
-if [ -n "$EXTRA_PIP" ]; then
-  pip install $EXTRA_PIP
-fi
-pip list
-
-MAX_SEQ_LEN_ARG=""
-if [ -n "$MAX_SEQ_LEN" ]; then
-  MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN"
-fi
-optimum-cli export executorch \
-    --model "$HF_MODEL" \
-    --task "$TASK" \
-    --recipe "cuda" \
-    --dtype bfloat16 \
-    --device cuda \
-    ${MAX_SEQ_LEN_ARG} \
-    ${EXTRA_ARGS} \
-    --output_dir ./
-
-if [ -n "$PREPROCESSOR_OUTPUT" ]; then
-  python -m executorch.extension.audio.mel_spectrogram \
-      --feature_size $PREPROCESSOR_FEATURE_SIZE \
-      --stack_output \
-      --max_audio_len 300 \
-      --output_file $PREPROCESSOR_OUTPUT
-fi
-
-test -f model.pte
-test -f aoti_cuda_blob.ptd
-if [ -n "$PREPROCESSOR_OUTPUT" ]; then
-  test -f $PREPROCESSOR_OUTPUT
-fi
-echo "::endgroup::"
-
-echo "::group::Store $MODEL_NAME Artifacts"
-mkdir -p "${OUTPUT_DIR}"
-cp model.pte "${OUTPUT_DIR}/"
-cp aoti_cuda_blob.ptd "${OUTPUT_DIR}/"
-if [ -n "$PREPROCESSOR_OUTPUT" ]; then
-  cp $PREPROCESSOR_OUTPUT "${OUTPUT_DIR}/"
-fi
-ls -al "${OUTPUT_DIR}"
-echo "::endgroup::"
diff --git a/.ci/scripts/test_model_cuda_e2e.sh b/.ci/scripts/test_model_cuda_e2e.sh
deleted file mode 100755
index 02845bf4b96..00000000000
--- a/.ci/scripts/test_model_cuda_e2e.sh
+++ /dev/null
@@ -1,207 +0,0 @@
-#!/bin/bash
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-# Test CUDA model end-to-end, need to run .ci/scripts/export_model_cuda_artifact.sh first
-
-show_help() {
-  cat << EOF
-Usage: test_model_cuda_e2e.sh <hf_model> <quant_name> [model_dir]
-
-Build and run end-to-end tests for CUDA models.
-
-Arguments:
-  hf_model    HuggingFace model ID (required)
-              Supported models:
-                - mistralai/Voxtral-Mini-3B-2507
-                - openai/whisper-small
-                - google/gemma-3-4b-it
-
-  quant_name  Quantization type (required)
-              Options:
-                - non-quantized
-                - quantized-int4-tile-packed
-                - quantized-int4-weight-only
-
-  model_dir   Directory containing model artifacts (optional, default: current directory)
-              Expected files: model.pte, aoti_cuda_blob.ptd
-              Tokenizers and test files will be downloaded to this directory
-
-Examples:
-  test_model_cuda_e2e.sh "openai/whisper-small" "non-quantized"
-  test_model_cuda_e2e.sh "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
-EOF
-}
-
-if [ "${1:-}" = "-h" ] || [ "${1:-}" = "--help" ]; then
-  show_help
-  exit 0
-fi
-
-if [ -z "${1:-}" ]; then
-  echo "Error: hf_model argument is required"
-  echo "Run with -h or --help for usage information"
-  exit 1
-fi
-
-if [ -z "${2:-}" ]; then
-  echo "Error: quant_name argument is required"
-  echo "Run with -h or --help for usage information"
-  exit 1
-fi
-
-set -eux
-
-HF_MODEL="$1"
-QUANT_NAME="$2"
-# Download tokenizers, audio, and image files to this directory
-MODEL_DIR="${3:-.}"
-
-echo "Testing model: $HF_MODEL (quantization: $QUANT_NAME)"
-
-# Make sure model.pte and aoti_cuda_blob.ptd exist
-if [ ! -f "$MODEL_DIR/model.pte" ]; then
-  echo "Error: model.pte not found in $MODEL_DIR"
-  exit 1
-fi
-if [ ! -f "$MODEL_DIR/aoti_cuda_blob.ptd" ]; then
-  echo "Error: aoti_cuda_blob.ptd not found in $MODEL_DIR"
-  exit 1
-fi
-# Locate EXECUTORCH_ROOT from the directory of this script
-SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
-EXECUTORCH_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
-
-pushd "$EXECUTORCH_ROOT"
-
-# Determine model configuration based on HF model ID
-case "$HF_MODEL" in
-  mistralai/Voxtral-Mini-3B-2507)
-    MODEL_NAME="voxtral"
-    RUNNER_TARGET="voxtral_runner"
-    RUNNER_PATH="voxtral"
-    EXPECTED_OUTPUT="poem"
-    PREPROCESSOR="voxtral_preprocessor.pte"
-    TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main" # @lint-ignore
-    TOKENIZER_FILE="tekken.json"
-    AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
-    AUDIO_FILE="poem.wav"
-    IMAGE_PATH=""
-    ;;
-  openai/whisper-small)
-    MODEL_NAME="whisper"
-    RUNNER_TARGET="whisper_runner"
-    RUNNER_PATH="whisper"
-    EXPECTED_OUTPUT="Mr. Quilter is the apostle of the middle classes"
-    PREPROCESSOR="whisper_preprocessor.pte"
-    TOKENIZER_URL="https://huggingface.co/openai/whisper-small/resolve/main" # @lint-ignore
-    TOKENIZER_FILE=""
-    AUDIO_URL=""
-    AUDIO_FILE="output.wav"
-    IMAGE_PATH=""
-    ;;
-  google/gemma-3-4b-it)
-    MODEL_NAME="gemma3"
-    RUNNER_TARGET="gemma3_e2e_runner"
-    RUNNER_PATH="gemma3"
-    EXPECTED_OUTPUT="chip"
-    PREPROCESSOR=""
-    TOKENIZER_URL="https://huggingface.co/unsloth/gemma-3-4b-it/resolve/main" # @lint-ignore
-    TOKENIZER_FILE=""
-    AUDIO_URL=""
-    AUDIO_FILE=""
-    IMAGE_PATH="docs/source/_static/img/et-logo.png"
-    ;;
-  *)
-    echo "Error: Unsupported model '$HF_MODEL'"
-    echo "Supported models: mistralai/Voxtral-Mini-3B-2507, openai/whisper-small, google/gemma-3-4b-it"
-    exit 1
-    ;;
-esac
-
-echo "::group::Setup ExecuTorch Requirements"
-./install_requirements.sh
-pip list
-echo "::endgroup::"
-
-echo "::group::Prepare $MODEL_NAME Artifacts"
-
-
-# Download tokenizer files
-if [ "$TOKENIZER_FILE" != "" ]; then
-  curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE
-else
-  curl -L $TOKENIZER_URL/tokenizer.json -o $MODEL_DIR/tokenizer.json
-  curl -L $TOKENIZER_URL/tokenizer_config.json -o $MODEL_DIR/tokenizer_config.json
-  curl -L $TOKENIZER_URL/special_tokens_map.json -o $MODEL_DIR/special_tokens_map.json
-fi
-
-# Download test files
-if [ "$AUDIO_URL" != "" ]; then
-  curl -L $AUDIO_URL -o ${MODEL_DIR}/$AUDIO_FILE
-elif [ "$MODEL_NAME" = "whisper" ]; then
-  conda install -y -c conda-forge "ffmpeg<8"
-  pip install datasets soundfile torchcodec
-  python -c "from datasets import load_dataset;import soundfile as sf;sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio'];sf.write('${MODEL_DIR}/$AUDIO_FILE', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
-fi
-
-ls -al
-echo "::endgroup::"
-
-echo "::group::Build $MODEL_NAME Runner"
-cmake --preset llm \
-      -DEXECUTORCH_BUILD_CUDA=ON \
-      -DCMAKE_INSTALL_PREFIX=cmake-out \
-      -DCMAKE_BUILD_TYPE=Release \
-      -Bcmake-out -S.
-cmake --build cmake-out -j$(nproc) --target install --config Release
-
-cmake -DEXECUTORCH_BUILD_CUDA=ON \
-      -DCMAKE_BUILD_TYPE=Release \
-      -Sexamples/models/$RUNNER_PATH \
-      -Bcmake-out/examples/models/$RUNNER_PATH/
-cmake --build cmake-out/examples/models/$RUNNER_PATH --target $RUNNER_TARGET --config Release
-echo "::endgroup::"
-
-echo "::group::Run $MODEL_NAME Runner"
-set +e
-export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
-
-# Build runner command with common arguments
-RUNNER_BIN="cmake-out/examples/models/$RUNNER_PATH/$RUNNER_TARGET"
-RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd --temperature 0"
-
-# Add model-specific arguments
-case "$MODEL_NAME" in
-  voxtral)
-    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
-    ;;
-  whisper)
-    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --audio_path ${MODEL_DIR}/$AUDIO_FILE --processor_path ${MODEL_DIR}/$PREPROCESSOR"
-    ;;
-  gemma3)
-    RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/ --image_path $IMAGE_PATH"
-    ;;
-esac
-
-OUTPUT=$($RUNNER_BIN $RUNNER_ARGS 2>&1)
-EXIT_CODE=$?
-set -e
-
-if ! echo "$OUTPUT" | grep -iq "$EXPECTED_OUTPUT"; then
-  echo "Expected output '$EXPECTED_OUTPUT' not found in output"
-  exit 1
-else
-  echo "Success: '$EXPECTED_OUTPUT' found in output"
-fi
-
-if [ $EXIT_CODE -ne 0 ]; then
-  echo "Unexpected exit code: $EXIT_CODE"
-  exit $EXIT_CODE
-fi
-echo "::endgroup::"
-
-popd
diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml
index 356180772c4..0b2424f3bf0 100644
--- a/.github/workflows/cuda.yml
+++ b/.github/workflows/cuda.yml
@@ -87,8 +87,8 @@ jobs:
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
         PYTHON_EXECUTABLE=python source .ci/scripts/test_model.sh "${{ matrix.model }}" cmake cuda
 
-  export-model-cuda-artifact:
-    name: export-model-cuda-artifact
+  export-voxtral-cuda-artifact:
+    name: export-voxtral-cuda-${{ matrix.quant.name }}
     # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
     if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
@@ -99,23 +99,17 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        model:
-          - repo: "mistralai"
-            name: "Voxtral-Mini-3B-2507"
-          - repo: "openai"
-            name: "whisper-small"
-          - repo: "google"
-            name: "gemma-3-4b-it"
         quant:
-          - "non-quantized"
-          - "quantized-int4-tile-packed"
-          - "quantized-int4-weight-only"
-        exclude:
-          # TODO: enable int4-weight-only on gemma3.
-          - model:
-              repo: "google"
-              name: "gemma-3-4b-it"
-            quant: "quantized-int4-weight-only"
+          - name: "non-quantized"
+            artifact: "voxtral-cuda-export"
+            extra_args: ""
+          - name: "quantized-int4-tile-packed"
+            artifact: "voxtral-cuda-quantized-int4-tile-packed"
+            extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
+          - name: "quantized-int4-weight-only"
+            artifact: "voxtral-cuda-quantized-int4-weight-only"
+            # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
+            extra_args: "--qlinear_encoder 4w"
     with:
       timeout: 90
       secrets-env: EXECUTORCH_HF_TOKEN
@@ -124,7 +118,7 @@ jobs:
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
-      upload-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-${{ matrix.quant }}
+      upload-artifact: ${{ matrix.quant.artifact }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
@@ -138,43 +132,128 @@ jobs:
         huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
         OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
         pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip install mistral-common librosa
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Export Voxtral (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
+        optimum-cli export executorch \
+            --model "mistralai/Voxtral-Mini-3B-2507" \
+            --task "multimodal-text-to-text" \
+            --recipe "cuda" \
+            --dtype bfloat16 \
+            --device cuda \
+            --max_seq_len 1024 \
+            ${EXTRA_ARGS} \
+            --output_dir ./
+        python -m executorch.extension.audio.mel_spectrogram \
+            --feature_size 128 \
+            --stack_output \
+            --max_audio_len 300 \
+            --output_file voxtral_preprocessor.pte
+
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
+        test -f voxtral_preprocessor.pte
         echo "::endgroup::"
 
-        source .ci/scripts/export_model_cuda_artifact.sh "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
+        echo "::group::Store Voxtral Artifacts (${{ matrix.quant.name }})"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        cp voxtral_preprocessor.pte "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}"
+        echo "::endgroup::"
 
-  benchmark-model-cuda:
-    name: benchmark-model-cuda
-    needs: export-model-cuda-artifact
+  export-gemma3-cuda-artifact:
+    name: export-gemma3-cuda-${{ matrix.quant.name }}
+    # Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
+    if: github.event.pull_request.head.repo.full_name == github.repository || github.event_name != 'pull_request'
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
+    secrets: inherit
     strategy:
       fail-fast: false
       matrix:
-        model:
-          - repo: "mistralai"
-            name: "Voxtral-Mini-3B-2507"
-          - repo: "google"
-            name: "gemma-3-4b-it"
         quant:
-          - "non-quantized"
-          - "quantized-int4-tile-packed"
-          - "quantized-int4-weight-only"
-        exclude:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+            extra_args: ""
+          - name: "quantized-int4-tile-packed"
+            artifact: "gemma3-cuda-quantized-int4-tile-packed"
+            extra_args: "--qlinear 4w --qlinear_encoder 4w --qlinear_packing_format tile_packed_to_4d --qlinear_encoder_packing_format tile_packed_to_4d"
           # TODO: enable int4-weight-only on gemma3.
-          - model:
-              repo: "google"
-              name: "gemma-3-4b-it"
-            quant: "quantized-int4-weight-only"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "voxtral-cuda-quantized-int4-weight-only"
+          #   # TODO: adding "--qlinear 4w" produces invalid results. Need further investigation.
+          #   extra_args: "--qlinear_encoder 4w"
     with:
       timeout: 90
+      secrets-env: EXECUTORCH_HF_TOKEN
       runner: linux.g5.4xlarge.nvidia.gpu
       gpu-arch-type: cuda
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
-      download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-${{ matrix.quant }}
+      upload-artifact: ${{ matrix.quant.artifact }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch"
+        ./install_executorch.sh
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        pip install -U "huggingface_hub[cli]<1.0" accelerate
+        huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Export Gemma3 (${{ matrix.quant.name }})"
+        EXTRA_ARGS="${{ matrix.quant.extra_args }}"
+        optimum-cli export executorch \
+            --model "google/gemma-3-4b-it" \
+            --task "multimodal-text-to-text" \
+            --recipe "cuda" \
+            --dtype bfloat16 \
+            --device cuda \
+            --max_seq_len 64 \
+            --output_dir ./
+
+        test -f model.pte
+        test -f aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Store Gemma3 Artifacts (${{ matrix.quant.name }})"
+        mkdir -p "${RUNNER_ARTIFACT_DIR}/"
+        cp model.pte "${RUNNER_ARTIFACT_DIR}/"
+        cp aoti_cuda_blob.ptd "${RUNNER_ARTIFACT_DIR}/"
+        ls -al "${RUNNER_ARTIFACT_DIR}/"
+        echo "::endgroup::"
+
+  benchmark-voxtral-cuda:
+    name: benchmark-voxtral-cuda
+    needs: export-voxtral-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: voxtral-cuda-export
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
         set -eux
@@ -184,13 +263,13 @@ jobs:
         pip list
         echo "::endgroup::"
 
-        echo "::group::Prepare ${{ matrix.model }} Artifacts"
+        echo "::group::Prepare Voxtral Artifacts"
         cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
         cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
         ls -al model.pte aoti_cuda_blob.ptd
         echo "::endgroup::"
 
-        echo "::group::Build ${{ matrix.model }} Benchmark"
+        echo "::group::Build Voxtral Benchmark"
         cmake -DCMAKE_BUILD_TYPE=Release \
               -DEXECUTORCH_BUILD_CUDA=ON \
               -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
@@ -198,19 +277,69 @@ jobs:
               -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
               -DEXECUTORCH_BUILD_TESTS=ON \
               -Bcmake-out .
-        cmake --build cmake-out -j$(nproc) --target multimodal_benchmark
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
         echo "::endgroup::"
 
-        echo "::group::Run ${{ matrix.model.name }} Benchmark"
+        echo "::group::Run Voxtral Benchmark"
 
         export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
-        cmake-out/backends/cuda/multimodal_benchmark ${{ matrix.model.name }} model.pte aoti_cuda_blob.ptd
+        cmake-out/backends/cuda/multimodal_benchmark voxtral model.pte aoti_cuda_blob.ptd
 
         echo "::endgroup::"
 
-  test-model-cuda-e2e:
-    name: test-model-cuda-e2e
-    needs: export-model-cuda-artifact
+  benchmark-gemma3-cuda:
+    name: benchmark-gemma3-cuda
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: gemma3-cuda-export
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        ls -al model.pte aoti_cuda_blob.ptd
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Benchmark"
+        cmake -DCMAKE_BUILD_TYPE=Release \
+              -DEXECUTORCH_BUILD_CUDA=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+              -DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
+              -DEXECUTORCH_BUILD_TESTS=ON \
+              -Bcmake-out .
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target multimodal_benchmark
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Benchmark"
+
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        cmake-out/backends/cuda/multimodal_benchmark gemma3 model.pte aoti_cuda_blob.ptd
+
+        echo "::endgroup::"
+
+  test-voxtral-cuda-e2e:
+    name: test-voxtral-cuda-e2e-${{ matrix.format.name }}
+    needs: export-voxtral-cuda-artifact
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -218,23 +347,103 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        model:
-          - repo: "mistralai"
-            name: "Voxtral-Mini-3B-2507"
-          - repo: "openai"
-            name: "whisper-small"
-          - repo: "google"
-            name: "gemma-3-4b-it"
-        quant:
-          - "non-quantized"
-          - "quantized-int4-tile-packed"
-          - "quantized-int4-weight-only"
-        exclude:
+        format:
+          - name: "non-quantized"
+            artifact: "voxtral-cuda-export"
+          - name: "quantized-int4-tile-packed"
+            artifact: "voxtral-cuda-quantized-int4-tile-packed"
+          - name: "quantized-int4-weight-only"
+            artifact: "voxtral-cuda-quantized-int4-weight-only"
+    with:
+      timeout: 90
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: 12.6
+      use-custom-docker-registry: false
+      submodules: recursive
+      download-artifact: ${{ matrix.format.artifact }}
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      script: |
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Voxtral Artifacts (${{ matrix.format.name }})"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        cp "${RUNNER_ARTIFACT_DIR}/voxtral_preprocessor.pte" .
+        TOKENIZER_URL="https://huggingface.co/mistralai/Voxtral-Mini-3B-2507/resolve/main/tekken.json"
+        curl -L $TOKENIZER_URL -o tekken.json
+        ls -al model.pte aoti_cuda_blob.ptd voxtral_preprocessor.pte tekken.json
+        echo "::endgroup::"
+
+        echo "::group::Download Test Audio File"
+        AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
+        curl -L $AUDIO_URL -o poem.wav
+        echo "::endgroup::"
+
+        echo "::group::Build Voxtral Runner"
+        cmake --preset llm \
+              -DEXECUTORCH_BUILD_CUDA=ON \
+              -DCMAKE_INSTALL_PREFIX=cmake-out \
+              -DCMAKE_BUILD_TYPE=Release \
+              -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+              -DCMAKE_BUILD_TYPE=Release \
+              -Sexamples/models/voxtral \
+              -Bcmake-out/examples/models/voxtral/
+        cmake --build cmake-out/examples/models/voxtral --target voxtral_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Voxtral Runner (${{ matrix.format.name }})"
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/examples/models/voxtral/voxtral_runner \
+              --model_path model.pte \
+              --data_path aoti_cuda_blob.ptd \
+              --tokenizer_path tekken.json \
+              --audio_path poem.wav \
+              --processor_path voxtral_preprocessor.pte \
+              --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        if ! echo "$OUTPUT" | grep -iq "poem"; then
+          echo "Expected output 'poem' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"
+
+  test-gemma3-cuda-e2e:
+    name: test-gemma3-cuda-e2e-${{ matrix.format.name }}
+    needs: export-gemma3-cuda-artifact
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        format:
+          - name: "non-quantized"
+            artifact: "gemma3-cuda-export"
+          - name: "quantized-int4-tile-packed"
+            artifact: "gemma3-cuda-quantized-int4-tile-packed"
           # TODO: enable int4-weight-only on gemma3.
-          - model:
-              repo: "google"
-              name: "gemma-3-4b-it"
-            quant: "quantized-int4-weight-only"
+          # - name: "quantized-int4-weight-only"
+          #   artifact: "gemma3-cuda-quantized-int4-weight-only"
     with:
       timeout: 90
       runner: linux.g5.4xlarge.nvidia.gpu
@@ -242,7 +451,61 @@ jobs:
       gpu-arch-version: 12.6
       use-custom-docker-registry: false
       submodules: recursive
-      download-artifact: ${{ matrix.model.repo }}-${{ matrix.model.name }}-cuda-${{ matrix.quant }}
+      download-artifact: ${{ matrix.format.artifact }}
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       script: |
-        source .ci/scripts/test_model_cuda_e2e.sh "${{ matrix.model.repo }}/${{ matrix.model.name }}" "${{ matrix.quant }}" "${RUNNER_ARTIFACT_DIR}"
+        set -eux
+
+        echo "::group::Setup ExecuTorch Requirements"
+        ./install_requirements.sh
+        pip list
+        echo "::endgroup::"
+
+        echo "::group::Prepare Gemma3 Artifacts (${{ matrix.format.name }})"
+        cp "${RUNNER_ARTIFACT_DIR}/model.pte" .
+        cp "${RUNNER_ARTIFACT_DIR}/aoti_cuda_blob.ptd" .
+        TOKENIZER_URL="https://huggingface.co/unsloth/gemma-3-1b-it/resolve/main/tokenizer.json"
+        curl -L $TOKENIZER_URL -o tokenizer.json
+        ls -al model.pte aoti_cuda_blob.ptd tokenizer.json
+        IMAGE_PATH="docs/source/_static/img/et-logo.png"
+        echo "::endgroup::"
+
+        echo "::group::Build Gemma3 Runner"
+        cmake --preset llm \
+              -DEXECUTORCH_BUILD_CUDA=ON \
+              -DCMAKE_INSTALL_PREFIX=cmake-out \
+              -DCMAKE_BUILD_TYPE=Release \
+              -Bcmake-out -S.
+        cmake --build cmake-out -j$(( $(nproc) - 1 )) --target install --config Release
+
+        cmake -DEXECUTORCH_BUILD_CUDA=ON \
+              -DCMAKE_BUILD_TYPE=Release \
+              -Sexamples/models/gemma3 \
+              -Bcmake-out/examples/models/gemma3/
+        cmake --build cmake-out/examples/models/gemma3 --target gemma3_e2e_runner --config Release
+        echo "::endgroup::"
+
+        echo "::group::Run Gemma3 Runner (${{ matrix.format.name }})"
+        set +e
+        export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
+        OUTPUT=$(cmake-out/examples/models/gemma3/gemma3_e2e_runner \
+              --model_path model.pte \
+              --data_path aoti_cuda_blob.ptd \
+              --tokenizer_path tokenizer.json \
+              --image_path $IMAGE_PATH \
+              --temperature 0 2>&1)
+        EXIT_CODE=$?
+        set -e
+
+        echo "$OUTPUT"
+
+        if ! echo "$OUTPUT" | grep -iq "chip"; then
+          echo "Expected output 'chip' not found in output"
+          exit 1
+        fi
+
+        if [ $EXIT_CODE -ne 0 ]; then
+          echo "Unexpected exit code: $EXIT_CODE"
+          exit $EXIT_CODE
+        fi
+        echo "::endgroup::"
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c6d6f26b41f..1b96c12fbf3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -926,11 +926,6 @@ if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
   list(APPEND _executorch_extensions extension_llm_runner)
 endif()
 
-if(EXECUTORCH_BUILD_EXTENSION_ASR_RUNNER)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/asr/runner)
-  list(APPEND _executorch_extensions extension_asr_runner)
-endif()
-
 if(EXECUTORCH_BUILD_EXTENSION_LLM_APPLE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/apple)
 endif()
diff --git a/backends/cuda/tests/multimodal_benchmark.cpp b/backends/cuda/tests/multimodal_benchmark.cpp
index 7365d0b7ba8..679db889b71 100644
--- a/backends/cuda/tests/multimodal_benchmark.cpp
+++ b/backends/cuda/tests/multimodal_benchmark.cpp
@@ -60,8 +60,7 @@ ModelType parse_model_type(const std::string& model_name) {
       lower_name.begin(),
       [](unsigned char c) { return std::tolower(c); });
 
-  if (lower_name.find("gemma3") != std::string::npos ||
-      lower_name.find("gemma-3") != std::string::npos) {
+  if (lower_name.find("gemma3") != std::string::npos) {
     return ModelType::GEMMA3;
   } else if (lower_name.find("voxtral") != std::string::npos) {
     return ModelType::VOXTRAL;
diff --git a/examples/models/whisper/CMakeLists.txt b/examples/models/whisper/CMakeLists.txt
deleted file mode 100644
index 70f5892baa7..00000000000
--- a/examples/models/whisper/CMakeLists.txt
+++ /dev/null
@@ -1,89 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-
-cmake_minimum_required(VERSION 3.29)
-project(whisper_runner)
-
-set(CMAKE_CXX_STANDARD 17)
-set(CMAKE_CXX_STANDARD_REQUIRED ON)
-
-set(EXECUTORCH_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/../../..")
-include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
-
-# Let files say "include <executorch/path/to/header.h>"
-set(_common_include_directories ${EXECUTORCH_ROOT}/..)
-
-# Need this for gflags for some reason
-set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
-find_package(gflags REQUIRED)
-
-list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
-find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
-
-set(_link_libraries executorch gflags)
-set(_srcs multimodal.cpp)
-
-list(
-  APPEND
-  _link_libraries
-  optimized_native_cpu_ops_lib
-  quantized_ops_lib
-  custom_ops
-  cpublas
-  eigen_blas
-)
-
-# XNNPACK
-if(TARGET xnnpack_backend)
-  list(APPEND _link_libraries xnnpack_backend)
-endif()
-
-# Add LLM runner and extension module
-if(NOT TARGET extension_asr_runner)
-  message(
-    FATAL_ERROR
-      "ExecuTorch must be installed with EXECUTORCH_BUILD_EXTENSION_ASR_RUNNER enabled."
-  )
-endif()
-
-# Needed for cpuinfo where it uses android specific log lib
-if(ANDROID)
-  list(APPEND _link_libraries log)
-endif()
-
-# Add the required ExecuTorch extensions for multimodal LLM runner
-list(
-  APPEND
-  _link_libraries
-  extension_asr_runner
-  extension_llm_runner # Needed for load_tokenizer()
-  extension_module
-  extension_data_loader
-  extension_tensor
-  extension_flat_tensor
-)
-
-# Link CUDA backend
-if(EXECUTORCH_BUILD_CUDA)
-  find_package(CUDAToolkit REQUIRED)
-  list(APPEND _link_libraries aoti_cuda)
-  executorch_target_link_options_shared_lib(aoti_cuda)
-endif()
-
-if(EXECUTORCH_BUILD_METAL)
-  list(APPEND _link_libraries metal_backend)
-  executorch_target_link_options_shared_lib(metal_backend)
-endif()
-
-# Add tokenizers
-list(APPEND _link_libraries tokenizers::tokenizers)
-
-add_executable(whisper_runner main.cpp)
-
-target_include_directories(whisper_runner PUBLIC ${_common_include_directories})
-
-target_link_libraries(whisper_runner PUBLIC ${_link_libraries})
-target_compile_options(whisper_runner PUBLIC ${_common_compile_options})
diff --git a/examples/models/whisper/README.md b/examples/models/whisper/README.md
deleted file mode 100644
index a4025441f7e..00000000000
--- a/examples/models/whisper/README.md
+++ /dev/null
@@ -1,126 +0,0 @@
-# Whisper Runner
-
-This directory hosts a lightweight C++ helper that drives Whisper models
-exported to ExecuTorch. The `AsrRunner` owns the `Module` instance that
-wraps a bundled `.pte` program and optional `.ptd` weight file, loads the
-`encoder` and `text_decoder` methods, and exposes a `transcribe()` loop that
-streams decoded text pieces through a callback.
-
-The runner assumes:
-- `model.pte` contains both Whisper encoder and decoder entry points named
-  `encoder` and `text_decoder`.
-- (Optional) Depending on export configurations, model weights can be optionally stored in a companion
-  `model.ptd`. The runner will load the file if present.
-- A tokenizer JSON compatible with the ExecuTorch tokenizers shim is available.
-
-Audio preprocessing is not part of the runner itself. To transform raw audio
-into the mel features expected by the encoder, reuse the pattern in
-`examples/models/voxtral/multimodal.cpp`, which loads a `preprocessor.pte`
-module to generate the spectrogram tensor.
-
-## Build
-
-Currently we have CUDA build support only. CPU and Metal backend builds are WIP.
-
-```bash
-# Install ExecuTorch libraries:
-cmake --preset llm -DEXECUTORCH_BUILD_CUDA=ON -DCMAKE_INSTALL_PREFIX=cmake-out -DCMAKE_BUILD_TYPE=Release . -Bcmake-out
-cmake --build cmake-out -j$(nproc) --target install --config Release
-
-# Build the runner:
-cmake \
-  -B cmake-out/examples/models/whisper \
-  -S examples/models/whisper
-cmake --build cmake-out/examples/models/whisper -j$(nproc)
-```
-
-The first cmake command build produces a static library named `extension_asr_runner`. The second cmake command links it into your
-application together with the standard ExecuTorch runtime libraries and the
-tokenizer target (`tokenizers::tokenizers`).
-
-## Usage
-
-### Export Whisper Model
-
-Use [Optimum-ExecuTorch](https://github.com/huggingface/optimum-executorch) to export a Whisper model from Hugging Face:
-
-```bash
-optimum-cli export executorch \
-    --model openai/whisper-small \
-    --task automatic-speech-recognition \
-    --recipe cuda \
-    --dtype bfloat16 \
-    --device cuda \
-    --output_dir ./
-```
-
-This command generates:
-- `model.pte` — Compiled Whisper model
-- `aoti_cuda_blob.ptd` — Weight data file for CUDA backend
-
-Export a preprocessor to convert raw audio to mel-spectrograms:
-
-```bash
-python -m executorch.extension.audio.mel_spectrogram \
-    --feature_size 80 \
-    --stack_output \
-    --max_audio_len 300 \
-    --output_file whisper_preprocessor.pte
-```
-
-### Quantization
-
-Export quantized models to reduce size and improve performance:
-
-```bash
-# 4-bit tile packed quantization for encoder
-optimum-cli export executorch \
-    --model openai/whisper-small \
-    --task automatic-speech-recognition \
-    --recipe cuda \
-    --dtype bfloat16 \
-    --device cuda \
-    --qlinear 4w \
-    --qlinear_encoder 4w \
-    --qlinear_packing_format tile_packed_to_4d \
-    --qlinear_encoder_packing_format tile_packed_to_4d \
-    --output_dir ./
-```
-
-
-### Download Tokenizer
-
-Download the tokenizer files required for inference:
-
-```bash
-curl -L https://huggingface.co/openai/whisper-small/resolve/main/tokenizer.json -o tokenizer.json
-curl -L https://huggingface.co/openai/whisper-small/resolve/main/tokenizer_config.json -o tokenizer_config.json
-curl -L https://huggingface.co/openai/whisper-small/resolve/main/special_tokens_map.json -o special_tokens_map.json
-```
-
-### Prepare Audio
-
-Generate test audio or use an existing WAV file. The model expects 16kHz mono audio.
-
-```bash
-# Generate sample audio using librispeech dataset
-python -c "from datasets import load_dataset; import soundfile as sf; sample = load_dataset('distil-whisper/librispeech_long', 'clean', split='validation')[0]['audio']; sf.write('output.wav', sample['array'][:sample['sampling_rate']*30], sample['sampling_rate'])"
-```
-
-### Run Inference
-
-After building the runner (see [Build](#build) section), execute it with the exported model and audio:
-
-```bash
-# Set library path for CUDA dependencies
-export LD_LIBRARY_PATH=/opt/conda/lib:$LD_LIBRARY_PATH
-
-# Run the Whisper runner
-cmake-out/examples/models/whisper/whisper_runner \
-    --model_path model.pte \
-    --data_path aoti_cuda_blob.ptd \
-    --tokenizer_path ./ \
-    --audio_path output.wav \
-    --processor_path whisper_preprocessor.pte \
-    --temperature 0
-```
diff --git a/examples/models/whisper/main.cpp b/examples/models/whisper/main.cpp
deleted file mode 100644
index b4462e2c39a..00000000000
--- a/examples/models/whisper/main.cpp
+++ /dev/null
@@ -1,126 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include <cmath>
-#include <cstring>
-#include <fstream>
-#include <iostream>
-#include <limits>
-#include <memory>
-#include <string>
-#include <vector>
-
-#include <gflags/gflags.h>
-
-#include <executorch/extension/asr/runner/runner.h>
-#include <executorch/extension/llm/runner/util.h>
-#include <executorch/extension/llm/runner/wav_loader.h>
-#include <executorch/extension/module/module.h>
-#include <executorch/extension/tensor/tensor_ptr_maker.h>
-#include <executorch/runtime/core/evalue.h>
-#include <executorch/runtime/platform/log.h>
-
-DEFINE_string(model_path, "model.pte", "Path to Whisper model (.pte).");
-DEFINE_string(data_path, "", "Optional path to Whisper weights (.ptd).");
-DEFINE_string(
-    tokenizer_path,
-    ".",
-    "Path to tokenizer directory containing tokenizer.json, tokenizer_config.json, and special_tokens_map.json.");
-DEFINE_string(
-    processor_path,
-    "",
-    "Path to preprocessor .pte for converting raw audio.");
-DEFINE_string(
-    audio_path,
-    "",
-    "Path to input audio file. Accepts .wav or raw float .bin.");
-DEFINE_double(
-    temperature,
-    0.0,
-    "Sampling temperature. 0.0 performs greedy decoding.");
-DEFINE_int32(max_new_tokens, 128, "Maximum number of tokens to generate.");
-
-using ::executorch::extension::from_blob;
-using ::executorch::extension::Module;
-
-int main(int argc, char** argv) {
-  gflags::ParseCommandLineFlags(&argc, &argv, true);
-  ::executorch::extension::TensorPtr features;
-  std::vector<float> audio_data;
-  std::unique_ptr<Module> processor;
-
-  if (FLAGS_audio_path.empty()) {
-    ET_LOG(Error, "audio_path flag must be provided.");
-    return 1;
-  }
-
-  audio_data =
-      executorch::extension::llm::load_wav_audio_data(FLAGS_audio_path);
-  ET_LOG(
-      Info,
-      "First 2 values of audio data: %f, %f",
-      audio_data[0],
-      audio_data[1]);
-
-  processor =
-      std::make_unique<Module>(FLAGS_processor_path, Module::LoadMode::Mmap);
-  auto load_error = processor->load();
-  if (load_error != ::executorch::runtime::Error::Ok) {
-    ET_LOG(Error, "Failed to load preprocessor module.");
-    return 1;
-  }
-
-  auto audio_tensor = from_blob(
-      audio_data.data(),
-      {static_cast<::executorch::aten::SizesType>(audio_data.size())},
-      ::executorch::aten::ScalarType::Float);
-
-  auto processed_result = processor->execute("forward", audio_tensor);
-  if (processed_result.error() != ::executorch::runtime::Error::Ok) {
-    ET_LOG(Error, "Audio preprocessing failed.");
-    return 1;
-  }
-  auto outputs = std::move(processed_result.get());
-  if (outputs.empty() || !outputs[0].isTensor()) {
-    ET_LOG(Error, "Preprocessor returned unexpected outputs.");
-    return 1;
-  }
-  auto tensor = outputs[0].toTensor();
-  ET_LOG(
-      Info,
-      "Result scalar_type: %s, first value %f",
-      ::executorch::runtime::toString(tensor.scalar_type()),
-      tensor.mutable_data_ptr<float>()[0]);
-  features = std::make_shared<::executorch::aten::Tensor>(std::move(tensor));
-
-  executorch::extension::asr::AsrRunner runner(
-      FLAGS_model_path, FLAGS_data_path, FLAGS_tokenizer_path);
-  auto load_err = runner.load();
-  if (load_err != ::executorch::runtime::Error::Ok) {
-    ET_LOG(Error, "Failed to load Whisper model.");
-    return 1;
-  }
-
-  executorch::extension::asr::AsrTranscribeConfig config;
-  config.max_new_tokens = FLAGS_max_new_tokens;
-  config.temperature = static_cast<float>(FLAGS_temperature);
-  config.decoder_start_token_id = 50257;
-
-  auto result =
-      runner.transcribe(features, config, [&](const std::string& piece) {
-        ::executorch::extension::llm::safe_printf(piece.c_str());
-        fflush(stdout);
-      });
-
-  if (!result.ok()) {
-    ET_LOG(Error, "Transcription failed.");
-    return 1;
-  }
-
-  return 0;
-}
diff --git a/extension/asr/runner/CMakeLists.txt b/extension/asr/runner/CMakeLists.txt
deleted file mode 100644
index cc9ba01596a..00000000000
--- a/extension/asr/runner/CMakeLists.txt
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
-#
-# This source code is licensed under the BSD-style license found in the
-# LICENSE file in the root directory of this source tree.
-#
-# ASR runner for models like Whisper
-#
-# ### Editing this file ###
-#
-# This file should be formatted with
-# ~~~
-# cmake-format -i CMakeLists.txt
-# ~~~
-# It should also be cmake-lint clean.
-#
-
-if(NOT EXECUTORCH_ROOT)
-  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
-endif()
-
-include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)
-
-set(runner_deps executorch_core extension_module extension_tensor
-                tokenizers::tokenizers
-)
-
-# Define runner library
-add_library(extension_asr_runner STATIC runner.cpp)
-target_include_directories(
-  extension_asr_runner INTERFACE ${_common_include_directories}
-)
-target_link_libraries(extension_asr_runner PUBLIC ${runner_deps})
-set_target_properties(
-  extension_asr_runner PROPERTIES POSITION_INDEPENDENT_CODE ON
-)
-
-install(
-  TARGETS extension_asr_runner
-  EXPORT ExecuTorchTargets
-  DESTINATION ${CMAKE_INSTALL_LIBDIR}
-  INCLUDES
-  DESTINATION ${_common_include_directories}
-)
-
-install(
-  DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/
-  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/executorch/extension/asr/runner
-  FILES_MATCHING
-  PATTERN "*.h"
-)
diff --git a/extension/asr/runner/runner.cpp b/extension/asr/runner/runner.cpp
deleted file mode 100644
index 6bbb44e4faa..00000000000
--- a/extension/asr/runner/runner.cpp
+++ /dev/null
@@ -1,321 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#include <executorch/extension/asr/runner/runner.h>
-
-#include <inttypes.h>
-#include <algorithm>
-#include <optional>
-
-#include <executorch/extension/llm/runner/constants.h>
-#include <executorch/extension/llm/runner/util.h>
-#include <executorch/extension/llm/sampler/util.h>
-#include <executorch/extension/tensor/tensor_ptr_maker.h>
-#include <executorch/runtime/core/evalue.h>
-#include <executorch/runtime/platform/assert.h>
-#include <executorch/runtime/platform/log.h>
-
-namespace executorch::extension::asr {
-namespace {
-
-constexpr const char* kEncoderMethodName = "encoder";
-constexpr const char* kDecoderMethodName = "text_decoder";
-
-} // namespace
-
-AsrRunner::AsrRunner(
-    const std::string& module_path,
-    std::optional<std::string> data_path,
-    const std::string& tokenizer_path)
-    : module_path_(module_path),
-      data_path_(data_path.value_or("")),
-      tokenizer_path_(tokenizer_path) {
-  if (data_path_.empty()) {
-    module_ = std::make_unique<Module>(module_path_, Module::LoadMode::Mmap);
-  } else {
-    module_ = std::make_unique<Module>(
-        module_path_, data_path_, Module::LoadMode::Mmap);
-  }
-}
-
-bool AsrRunner::is_loaded() const {
-  return module_ && encoder_method_loaded_ && decoder_method_loaded_ &&
-      tokenizer_ && tokenizer_->is_loaded() && !eos_token_ids_.empty();
-}
-
-Error AsrRunner::load_tokenizer() {
-  if (tokenizer_ && tokenizer_->is_loaded()) {
-    return Error::Ok;
-  }
-
-  auto tokenizer =
-      ::executorch::extension::llm::load_tokenizer(tokenizer_path_);
-  ET_CHECK_OR_RETURN_ERROR(
-      tokenizer,
-      Internal,
-      "Failed to create tokenizer from %s",
-      tokenizer_path_.c_str());
-
-  tokenizer_ = std::move(tokenizer);
-  if (!tokenizer_->is_loaded()) {
-    ET_LOG(
-        Error,
-        "Tokenizer reported unloaded state after load: %s",
-        tokenizer_path_.c_str());
-    return Error::Internal;
-  }
-
-  eos_token_ids_.clear();
-  eos_token_ids_.insert(static_cast<int64_t>(tokenizer_->eos_tok()));
-  return Error::Ok;
-}
-
-Error AsrRunner::load() {
-  if (is_loaded()) {
-    return Error::Ok;
-  }
-
-  stats_.model_load_start_ms = ::executorch::extension::llm::time_in_ms();
-
-  ET_CHECK_OR_RETURN_ERROR(
-      module_ != nullptr,
-      InvalidArgument,
-      "Module handle is null for path %s",
-      module_path_.c_str());
-
-  ET_CHECK_OK_OR_RETURN_ERROR(module_->load());
-
-  auto method_names_result = module_->method_names();
-  ET_CHECK_OK_OR_RETURN_ERROR(method_names_result.error());
-  const auto& method_names = method_names_result.get();
-
-  ET_CHECK_OR_RETURN_ERROR(
-      method_names.count(kEncoderMethodName) &&
-          method_names.count(kDecoderMethodName),
-      InvalidArgument,
-      "Required methods not found. encoder=%d, text_decoder=%d",
-      static_cast<int>(method_names.count(kEncoderMethodName)),
-      static_cast<int>(method_names.count(kDecoderMethodName)));
-
-  ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kEncoderMethodName));
-  encoder_method_loaded_ = true;
-
-  ET_CHECK_OK_OR_RETURN_ERROR(module_->load_method(kDecoderMethodName));
-  decoder_method_loaded_ = true;
-
-  ET_CHECK_OK_OR_RETURN_ERROR(load_tokenizer());
-  auto eos_ids = get_eos_ids(tokenizer_.get(), module_.get());
-  if (!eos_ids.empty()) {
-    eos_token_ids_.clear();
-    for (uint64_t eos_id : eos_ids) {
-      eos_token_ids_.insert(static_cast<int64_t>(eos_id));
-    }
-  }
-
-  stats_.model_load_end_ms = ::executorch::extension::llm::time_in_ms();
-
-  return Error::Ok;
-}
-
-Result<std::vector<int64_t>> AsrRunner::transcribe(
-    ::executorch::extension::TensorPtr preprocessed_features,
-    AsrTranscribeConfig config,
-    std::function<void(const std::string&)> token_callback) {
-  ET_CHECK_OR_RETURN_ERROR(
-      config.max_new_tokens > 0,
-      InvalidArgument,
-      "max_new_tokens must be positive, got %" PRId64,
-      config.max_new_tokens);
-
-  ET_LOG(
-      Info,
-      "Preprocessed features shape: [%zu, %zu, %zu]",
-      static_cast<size_t>(preprocessed_features->size(0)),
-      static_cast<size_t>(preprocessed_features->size(1)),
-      static_cast<size_t>(preprocessed_features->size(2)));
-
-  if (!is_loaded()) {
-    ET_CHECK_OK_OR_RETURN_ERROR(load());
-  }
-
-  ET_LOG(
-      Info,
-      "RSS after loading model: %f MiB (0 if unsupported)",
-      ::executorch::extension::llm::get_rss_bytes() / 1024.0 / 1024.0);
-
-  // Reset internal state and start inference
-  stats_.inference_start_ms = ::executorch::extension::llm::time_in_ms();
-
-  const std::unordered_set<int64_t>* eos_tokens = &eos_token_ids();
-  if (!config.eos_token_ids.empty()) {
-    eos_tokens = &config.eos_token_ids;
-  }
-  ET_CHECK_OR_RETURN_ERROR(
-      !eos_tokens->empty(),
-      InvalidArgument,
-      "EOS token set must not be empty.");
-  ::executorch::extension::llm::Sampler sampler(
-      tokenizer_->vocab_size(), config.temperature);
-
-  // Check expected dtype for encoder input
-  auto encoder_method_meta_result = module_->method_meta(kEncoderMethodName);
-  ET_CHECK_OK_OR_RETURN_ERROR(encoder_method_meta_result.error());
-  auto encoder_method_meta = encoder_method_meta_result.get();
-
-  ::executorch::aten::ScalarType expected_dtype =
-      ::executorch::aten::ScalarType::Float;
-  if (encoder_method_meta.num_inputs() > 0) {
-    auto input_meta_result = encoder_method_meta.input_tensor_meta(0);
-    if (input_meta_result.error() == ::executorch::runtime::Error::Ok) {
-      expected_dtype = input_meta_result.get().scalar_type();
-    }
-  }
-
-  // Convert preprocessed_features to expected dtype if needed
-  if (preprocessed_features->scalar_type() != expected_dtype) {
-    if (expected_dtype == ::executorch::aten::ScalarType::BFloat16) {
-      ET_LOG(
-          Info,
-          "Converting audio features from %s to BFloat16. Before converting, first value = %f",
-          ::executorch::runtime::toString(preprocessed_features->scalar_type()),
-          preprocessed_features->mutable_data_ptr<float>()[0]);
-      auto convert_result = ::executorch::extension::llm::convert_to_bfloat16(
-          preprocessed_features);
-      ET_CHECK_OK_OR_RETURN_ERROR(convert_result.error());
-      preprocessed_features = convert_result.get();
-      ET_LOG(
-          Info,
-          "Conversion complete, first value = %f",
-          static_cast<float>(
-              preprocessed_features
-                  ->mutable_data_ptr<::executorch::aten::BFloat16>()[0]));
-    }
-  }
-
-  auto encoder_result =
-      module_->execute(kEncoderMethodName, preprocessed_features);
-  ET_CHECK_OK_OR_RETURN_ERROR(encoder_result.error());
-
-  stats_.prompt_eval_end_ms = ::executorch::extension::llm::time_in_ms();
-  stats_.num_prompt_tokens = 0;
-
-  auto encoder_outputs = std::move(*encoder_result);
-  ET_CHECK_OR_RETURN_ERROR(
-      encoder_outputs.size() == 1 && encoder_outputs[0].isTensor(),
-      Internal,
-      "Encoder returned %zu outputs; expected a single tensor.",
-      encoder_outputs.size());
-
-  ::executorch::aten::Tensor encoder_output_tensor =
-      std::move(encoder_outputs[0]).toTensor();
-
-  ET_LOG(
-      Info,
-      "Encoder output shape: [%zu, %zu, %zu]",
-      static_cast<size_t>(encoder_output_tensor.size(0)),
-      static_cast<size_t>(encoder_output_tensor.size(1)),
-      static_cast<size_t>(encoder_output_tensor.size(2)));
-  ET_LOG(
-      Info,
-      "Encoder first value: %f",
-      static_cast<float>(
-          encoder_output_tensor
-              .mutable_data_ptr<::executorch::aten::BFloat16>()[0]));
-
-  auto encoder_output_ptr = std::make_shared<::executorch::aten::Tensor>(
-      std::move(encoder_output_tensor));
-
-  std::vector<int64_t> tokens = {config.decoder_start_token_id};
-
-  int64_t input_id = config.decoder_start_token_id;
-  int64_t cache_position = 0;
-  int64_t generated_tokens = 0;
-  bool first_token_generated = false;
-  auto decoder_input_ptr = ::executorch::extension::from_blob(
-      &input_id,
-      {static_cast<::executorch::aten::SizesType>(1),
-       static_cast<::executorch::aten::SizesType>(1)},
-      ::executorch::aten::ScalarType::Long);
-
-  auto cache_position_ptr = ::executorch::extension::from_blob(
-      &cache_position,
-      {static_cast<::executorch::aten::SizesType>(1)},
-      ::executorch::aten::ScalarType::Long);
-
-  std::vector<::executorch::runtime::EValue> decoder_inputs;
-  decoder_inputs.reserve(3);
-  decoder_inputs.emplace_back(decoder_input_ptr);
-  decoder_inputs.emplace_back(encoder_output_ptr);
-  decoder_inputs.emplace_back(cache_position_ptr);
-  // Add some green coloring for the first generated token
-  // token_callback("\033[1;32m");
-  while (generated_tokens < config.max_new_tokens) {
-    input_id = tokens.back();
-    auto decoder_result = module_->execute(kDecoderMethodName, decoder_inputs);
-    ET_CHECK_OK_OR_RETURN_ERROR(decoder_result.error());
-
-    auto decoder_outputs = std::move(*decoder_result);
-    ET_CHECK_OR_RETURN_ERROR(
-        decoder_outputs.size() == 1 && decoder_outputs[0].isTensor(),
-        Internal,
-        "Decoder returned %zu outputs; expected a single tensor.",
-        decoder_outputs.size());
-
-    ::executorch::aten::Tensor logits_tensor =
-        std::move(decoder_outputs[0]).toTensor();
-    const int64_t vocab_size = logits_tensor.numel();
-    ET_CHECK_OR_RETURN_ERROR(
-        vocab_size > 0, Internal, "Decoder logits tensor is empty.");
-
-    const int64_t next_token =
-        static_cast<int64_t>(::executorch::extension::llm::logits_to_token(
-            logits_tensor, config.temperature));
-
-    if (!first_token_generated) {
-      stats_.first_token_ms = ::executorch::extension::llm::time_in_ms();
-      first_token_generated = true;
-    }
-
-    const int64_t prev_token = input_id;
-    tokens.push_back(next_token);
-    ++generated_tokens;
-    ++cache_position;
-    input_id = next_token;
-
-    if (token_callback) {
-      auto piece_result = tokenizer_->decode(
-          static_cast<uint64_t>(prev_token), static_cast<uint64_t>(next_token));
-      if (piece_result.ok()) {
-        token_callback(piece_result.get());
-      } else {
-        ET_LOG(
-            Error,
-            "Tokenizer failed to decode token pair (%" PRId64 ", %" PRId64
-            ") with error %d",
-            prev_token,
-            next_token,
-            static_cast<int>(piece_result.error()));
-      }
-    }
-
-    if (eos_tokens->count(next_token) > 0) {
-      break;
-    }
-  }
-  // Reset coloring
-  // token_callback("\033[0m");
-  // Update stats and print report
-  stats_.num_generated_tokens = generated_tokens;
-  stats_.inference_end_ms = ::executorch::extension::llm::time_in_ms();
-  printf("\n");
-  print_report(stats_);
-
-  return tokens;
-}
-
-} // namespace executorch::extension::asr
diff --git a/extension/asr/runner/runner.h b/extension/asr/runner/runner.h
deleted file mode 100644
index a9f8ce3edda..00000000000
--- a/extension/asr/runner/runner.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <string>
-#include <unordered_set>
-#include <vector>
-
-#include <executorch/extension/llm/runner/llm_runner_helper.h>
-#include <executorch/extension/llm/runner/stats.h>
-#include <executorch/extension/llm/sampler/sampler.h>
-#include <executorch/extension/module/module.h>
-#include <executorch/extension/tensor/tensor_ptr.h>
-#include <executorch/runtime/core/error.h>
-#include <executorch/runtime/core/result.h>
-#include <pytorch/tokenizers/tokenizer.h>
-
-namespace executorch::extension::asr {
-
-using ::executorch::extension::Module;
-using ::executorch::extension::llm::get_eos_ids;
-using ::executorch::extension::llm::load_tokenizer;
-using ::executorch::extension::llm::print_report;
-using ::executorch::extension::llm::Sampler;
-using ::executorch::extension::llm::Stats;
-using ::executorch::runtime::Error;
-using ::executorch::runtime::Result;
-
-/**
- * Configuration for the ASR transcription loop.
- *
- * max_new_tokens controls the number of tokens generated after the prompt.
- * Temperature controls the randomness of the output.
- */
-struct ET_EXPERIMENTAL AsrTranscribeConfig {
-  int64_t max_new_tokens = 128;
-  std::unordered_set<int64_t> eos_token_ids = {};
-  float temperature = 0.0f;
-  int64_t decoder_start_token_id = 0;
-};
-
-/**
- * Runner that owns a ASR model encoder + decoder pair exported as a single
- * ExecuTorch module. A good example is Whisper
- * (https://huggingface.co/openai/whisper-small)
- *
- * The module is expected to expose two callable methods:
- *  - "encoder": processes precomputed audio features into encoder states.
- *  - "text_decoder": consumes the decoder input ids, encoder output and cache
- *    positions to autoregressively generate logits.
- */
-class ET_EXPERIMENTAL AsrRunner {
- public:
-  AsrRunner(
-      const std::string& module_path,
-      std::optional<std::string> data_path,
-      const std::string& tokenizer_path);
-
-  /**
-   * Returns true when the module and tokenizer are ready for inference.
-   */
-  bool is_loaded() const;
-
-  /**
-   * Loads the module, validates required methods and initialises tokenizer.
-   */
-  ::executorch::runtime::Error load();
-
-  /**
-   * Executes an end-to-end transcription cycle.
-   *
-   * @param preprocessed_features Audio features tensor of shape [batch, time,
-   * features] already processed by a preprocessor module. Typically produced
-   * by an audio feature extractor (e.g., mel-spectrogram computation).
-   * @param config Controls generation length and termination criteria.
-   * @param token_callback Optional functor invoked for each decoded piece of
-   * text emitted during generation.
-   *
-   * @returns Result containing the final decoder token ids (including the seed
-   * prompt and generated tokens), or an error.
-   */
-  ::executorch::runtime::Result<std::vector<int64_t>> transcribe(
-      ::executorch::extension::TensorPtr preprocessed_features,
-      AsrTranscribeConfig config = {},
-      std::function<void(const std::string&)> token_callback = {});
-
- private:
-  ::executorch::runtime::Error load_tokenizer();
-  inline const std::unordered_set<int64_t>& eos_token_ids() const {
-    return eos_token_ids_;
-  }
-
-  std::string module_path_;
-  std::string data_path_;
-  std::string tokenizer_path_;
-
-  std::unique_ptr<Module> module_;
-  std::unique_ptr<::tokenizers::Tokenizer> tokenizer_;
-  std::unordered_set<int64_t> eos_token_ids_;
-
-  bool encoder_method_loaded_ = false;
-  bool decoder_method_loaded_ = false;
-
-  Stats stats_;
-};
-
-} // namespace executorch::extension::asr
diff --git a/extension/llm/runner/text_decoder_runner.h b/extension/llm/runner/text_decoder_runner.h
index 720000185c9..9a090de50d6 100644
--- a/extension/llm/runner/text_decoder_runner.h
+++ b/extension/llm/runner/text_decoder_runner.h
@@ -12,7 +12,6 @@
 
 #include <executorch/extension/llm/runner/io_manager/io_manager.h>
 #include <executorch/extension/llm/sampler/sampler.h>
-#include <executorch/extension/llm/sampler/util.h>
 
 namespace executorch {
 namespace extension {
@@ -65,8 +64,39 @@ class ET_EXPERIMENTAL TextDecoderRunner {
   inline int32_t logits_to_token(
       const executorch::aten::Tensor& logits_tensor,
       const float temperature = 0.0f) {
-    return ::executorch::extension::llm::logits_to_token(
-        logits_tensor, temperature);
+    int32_t result = 0;
+
+    // Create a minimal context for error handling in ET_SWITCH
+    struct {
+      [[noreturn]] void fail(torch::executor::Error /* error */) {
+        ET_CHECK_MSG(false, "Unsupported dtype in logits_to_token");
+      }
+    } ctx;
+
+    ET_SWITCH_FOUR_TYPES(
+        Float,
+        Half,
+        BFloat16,
+        UInt16,
+        logits_tensor.scalar_type(),
+        ctx,
+        "logits_to_token",
+        CTYPE,
+        [&]() {
+          // If the logit_tensor rank is 3, the shape is [batch, seq_length,
+          // vocab_size], get the last logits, sample and return. Else the model
+          // outputs the last logit, directly sample and return.
+          auto* logits = logits_tensor.mutable_data_ptr<CTYPE>();
+          ssize_t vocab_size = logits_tensor.size(logits_tensor.dim() - 1);
+          if (logits_tensor.dim() == 3) {
+            auto num_tokens = logits_tensor.size(1);
+            logits += (num_tokens - 1) * vocab_size;
+          }
+          // @lint-ignore CLANGTIDY facebook-hte-Deprecated
+          Sampler sampler(vocab_size, temperature);
+          result = sampler.sample(logits);
+        });
+    return result;
   }
 
  protected:
diff --git a/extension/llm/sampler/targets.bzl b/extension/llm/sampler/targets.bzl
index b76bfcd6133..9b7751c19e7 100644
--- a/extension/llm/sampler/targets.bzl
+++ b/extension/llm/sampler/targets.bzl
@@ -8,7 +8,6 @@ def define_common_targets():
             name = "sampler" + aten_suffix,
             exported_headers = [
                 "sampler.h",
-                "util.h",
             ],
             preprocessor_flags = [
                 "-DUSE_ATEN_LIB",
diff --git a/extension/llm/sampler/util.h b/extension/llm/sampler/util.h
deleted file mode 100644
index 6a3a06355ca..00000000000
--- a/extension/llm/sampler/util.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) Meta Platforms, Inc. and affiliates.
- * All rights reserved.
- *
- * This source code is licensed under the BSD-style license found in the
- * LICENSE file in the root directory of this source tree.
- */
-
-#pragma once
-
-#include <executorch/extension/llm/sampler/sampler.h>
-#include <executorch/runtime/core/exec_aten/exec_aten.h>
-
-namespace executorch {
-namespace extension {
-namespace llm {
-
-/**
- * Sample the next token from the logits tensor.
- * @param logits_tensor The logits tensor.
- * @param temperature The temperature parameter used to control randomness in
- * sampling.
- * @return The next token.
- */
-inline int32_t logits_to_token(
-    const executorch::aten::Tensor& logits_tensor,
-    const float temperature = 0.0f) {
-  int32_t result = 0;
-
-  // Create a minimal context for error handling in ET_SWITCH
-  struct {
-    [[noreturn]] void fail(torch::executor::Error /* error */) {
-      ET_CHECK_MSG(false, "Unsupported dtype in logits_to_token");
-    }
-  } ctx;
-
-  ET_SWITCH_FOUR_TYPES(
-      Float,
-      Half,
-      BFloat16,
-      UInt16,
-      logits_tensor.scalar_type(),
-      ctx,
-      "logits_to_token",
-      CTYPE,
-      [&]() {
-        // If the logit_tensor rank is 3, the shape is [batch, seq_length,
-        // vocab_size], get the last logits, sample and return. Else the model
-        // outputs the last logit, directly sample and return.
-        auto* logits = logits_tensor.mutable_data_ptr<CTYPE>();
-        ssize_t vocab_size = logits_tensor.size(logits_tensor.dim() - 1);
-        if (logits_tensor.dim() == 3) {
-          auto num_tokens = logits_tensor.size(1);
-          logits += (num_tokens - 1) * vocab_size;
-        }
-        // @lint-ignore CLANGTIDY facebook-hte-Deprecated
-        Sampler sampler(vocab_size, temperature);
-        result = sampler.sample(logits);
-      });
-  return result;
-}
-
-} // namespace llm
-} // namespace extension
-} // namespace executorch
diff --git a/tools/cmake/executorch-config.cmake b/tools/cmake/executorch-config.cmake
index d6f8ded668b..3f97db77ccc 100644
--- a/tools/cmake/executorch-config.cmake
+++ b/tools/cmake/executorch-config.cmake
@@ -68,7 +68,6 @@ set(optional_lib_list
     qnn_executorch_backend
     portable_ops_lib
     custom_ops
-    extension_asr_runner
     extension_evalue_util
     extension_llm_runner
     extension_module
diff --git a/tools/cmake/preset/default.cmake b/tools/cmake/preset/default.cmake
index 0dcec0df531..861e41e4a63 100644
--- a/tools/cmake/preset/default.cmake
+++ b/tools/cmake/preset/default.cmake
@@ -65,10 +65,6 @@ define_overridable_option(
   EXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT
   "Build the optimized ops library for AOT export usage" BOOL OFF
 )
-define_overridable_option(
-  EXECUTORCH_BUILD_EXTENSION_ASR_RUNNER "Build the ASR runner extension" BOOL
-  OFF
-)
 define_overridable_option(
   EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "Build the Data Loader extension" BOOL
   ON # Required by executor_runner
diff --git a/tools/cmake/preset/llm.cmake b/tools/cmake/preset/llm.cmake
index f3d3ab8ef8f..6cd2482f717 100644
--- a/tools/cmake/preset/llm.cmake
+++ b/tools/cmake/preset/llm.cmake
@@ -5,7 +5,6 @@
 # LICENSE file in the root directory of this source tree.
 
 # keep sorted
-set_overridable_option(EXECUTORCH_BUILD_EXTENSION_ASR_RUNNER ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
 set_overridable_option(EXECUTORCH_BUILD_EXTENSION_LLM ON)