Update CI for HF Optimum models

Guang Yang · Guang Yang · commit fc92a31cb279 · 2025-05-12T15:18:51.000-07:00
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
@@ -555,11 +555,11 @@ jobs:
     strategy:
       matrix:
         hf_model_id: [
-          google/gemma-2-2b,
-          Qwen/Qwen2.5-0.5B,
+          google/gemma-3-1b-it,
+          Qwen/Qwen3-0.6B,
           HuggingFaceTB/SmolLM2-135M,
           meta-llama/Llama-3.2-1B,
-          allenai/OLMo-1B-hf
+          allenai/OLMo-1B-hf,
         ]
       fail-fast: false
     with:
@@ -575,38 +575,93 @@ jobs:
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
         PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+        # Build executor_runner with ETDump (event tracer) support enabled
+        PYTHON_EXECUTABLE=python cmake -DPYTHON_EXECUTABLE=python \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DEXECUTORCH_ENABLE_LOGGING=1 \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+          -DEXECUTORCH_BUILD_XNNPACK=ON \
+          -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+          -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+          -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+          -DEXECUTORCH_BUILD_DEVTOOLS=ON \
+          -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
+          -Bcmake-out .
+        cmake --build cmake-out -j16 --target install --config Release
         echo "::endgroup::"
 
         echo "::group::Set up Hugging Face"
         pip install -U "huggingface_hub[cli]"
         huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
         git clone https://github.com/huggingface/optimum-executorch
-        cd optimum-executorch
+        pushd optimum-executorch
         # There is no release yet, for CI stability, always test from the same commit on main
-        git checkout 577a2b19670e4c643a5c6ecb09bf47b9a699e7c6
+        git checkout da80c9e35b3db5c7eea8731b7d660482fb4870a8
         pip install .[tests]
+        popd
+
+        if [ "${{ matrix.hf_model_id }}" == "google/gemma-3-1b-it" ]; then
+          # Fixes for gemma-3 are not yet available in a released version of transformers
+          git clone https://github.com/huggingface/transformers.git
+          pushd transformers
+          git checkout a57274466f7f72efaa2662d1738cdaf28ae8071f
+          pip install -e .
+          popd
+        fi
         pip list
         echo "::endgroup::"
 
-        echo "::group::Export and Run ${{ matrix.hf_model_id }}"
+        echo "::group::Export to ExecuTorch"
         # Pass matrix variable as environment variable
         export MODEL_ID="${{ matrix.hf_model_id }}"
+        # Absolute path: later steps read the artifacts from other directories
+        export OUTPUT_DIR="$(pwd)/${MODEL_ID}_custom_sdpa_8da4w"
+        pushd optimum-executorch
+
+        optimum-cli export executorch \
+          --model ${MODEL_ID} \
+          --task text-generation \
+          --recipe xnnpack \
+          --use_custom_sdpa \
+          --output_dir "${OUTPUT_DIR}" \
+          --qlinear
+
+        ls -FlAGhp "${OUTPUT_DIR}"
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Inference using python API"
+        pushd optimum-executorch
         python -c "
         import os
         from optimum.executorch import ExecuTorchModelForCausalLM
         from transformers import AutoTokenizer
 
         model_id = os.getenv('MODEL_ID')
-        print(f'Loading model: {model_id}')
-        model = ExecuTorchModelForCausalLM.from_pretrained(model_id, recipe='xnnpack')
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        pte_dir = os.getenv('OUTPUT_DIR')
+        print(f'Loading model {model_id} from {pte_dir}.')
+        model = ExecuTorchModelForCausalLM.from_pretrained(pte_dir)
         generated_text = model.text_generation(
-          tokenizer=tokenizer,
+          tokenizer=AutoTokenizer.from_pretrained(model_id),
           prompt='Simply put, the theory of relativity states that',
           max_seq_len=64
         )
         print(generated_text)
         "
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Inference using executor_runner with ETDump"
+        # Run from the directory where cmake-out was built (no pushd needed)
+        ./cmake-out/executor_runner \
+          --model_path "${OUTPUT_DIR}/model.pte" \
+          --etdump_path "${OUTPUT_DIR}/etdump.etdp"
+
+        python3 -m devtools.inspector.inspector_cli \
+          --etdump_path "${OUTPUT_DIR}/etdump.etdp"
         echo "::endgroup::"