diff --git a/.ci/scripts/test_torchao_huggingface_checkpoints.sh b/.ci/scripts/test_torchao_huggingface_checkpoints.sh
new file mode 100644
index 00000000000..c0910b47826
--- /dev/null
+++ b/.ci/scripts/test_torchao_huggingface_checkpoints.sh
@@ -0,0 +1,139 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# -------------------------
+# Args / flags
+# -------------------------
+TEST_WITH_RUNNER=0
+MODEL_NAME=""
+
+# Parse args
+if [[ $# -lt 1 ]]; then
+    echo "Usage: $0 <model_name> [--test_with_runner]"
+    echo "Supported model_name values: qwen3_4b, phi_4_mini"
+    exit 1
+fi
+
+MODEL_NAME="$1"
+shift
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --test_with_runner)
+            TEST_WITH_RUNNER=1
+            ;;
+        -h|--help)
+            echo "Usage: $0 <model_name> [--test_with_runner]"
+            echo "  model_name: qwen3_4b | phi_4_mini"
+            echo "  --test_with_runner: build ET + run llama_main to sanity-check the export"
+            exit 0
+            ;;
+        *)
+            echo "Unknown option: $1"
+            exit 1
+            ;;
+    esac
+    shift
+done
+
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
+    PYTHON_EXECUTABLE=python3
+fi
+
+MODEL_OUT=model.pte
+
+case "$MODEL_NAME" in
+    qwen3_4b)
+        echo "Running Qwen3-4B export..."
+        HF_MODEL_DIR=$(hf download pytorch/Qwen3-4B-INT8-INT4)
+        EXPECTED_MODEL_SIZE_UPPER_BOUND=$((3 * 1024 * 1024 * 1024)) # 3GB
+        $PYTHON_EXECUTABLE -m executorch.examples.models.qwen3.convert_weights \
+            $HF_MODEL_DIR \
+            pytorch_model_converted.bin
+
+        $PYTHON_EXECUTABLE -m executorch.examples.models.llama.export_llama \
+            --model "qwen3_4b" \
+            --checkpoint pytorch_model_converted.bin \
+            --params examples/models/qwen3/config/4b_config.json \
+            --output_name $MODEL_OUT \
+            -kv \
+            --use_sdpa_with_kv_cache \
+            -X \
+            --xnnpack-extended-ops \
+            --max_context_length 1024 \
+            --max_seq_length 1024 \
+            --dtype fp32 \
+            --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}'
+        ;;
+
+    phi_4_mini)
+        echo "Running Phi-4-mini export..."
+        HF_MODEL_DIR=$(hf download pytorch/Phi-4-mini-instruct-INT8-INT4)
+        EXPECTED_MODEL_SIZE_UPPER_BOUND=$((3 * 1024 * 1024 * 1024)) # 3GB
+        $PYTHON_EXECUTABLE -m executorch.examples.models.phi_4_mini.convert_weights \
+            $HF_MODEL_DIR \
+            pytorch_model_converted.bin
+
+        $PYTHON_EXECUTABLE -m executorch.examples.models.llama.export_llama \
+            --model "phi_4_mini" \
+            --checkpoint pytorch_model_converted.bin \
+            --params examples/models/phi_4_mini/config/config.json \
+            --output_name $MODEL_OUT \
+            -kv \
+            --use_sdpa_with_kv_cache \
+            -X \
+            --xnnpack-extended-ops \
+            --max_context_length 1024 \
+            --max_seq_length 1024 \
+            --dtype fp32 \
+            --metadata '{"get_bos_id":199999, "get_eos_ids":[200020,199999]}'
+        ;;
+
+    *)
+        echo "Error: unsupported model_name '$MODEL_NAME'"
+        echo "Supported values: qwen3_4b, phi_4_mini"
+        exit 1
+        ;;
+esac
+
+# Check file size
+MODEL_SIZE=$(stat --printf="%s" $MODEL_OUT 2>/dev/null || stat -f%z $MODEL_OUT)
+if [[ $MODEL_SIZE -gt $EXPECTED_MODEL_SIZE_UPPER_BOUND ]]; then
+    echo "Error: model size $MODEL_SIZE is greater than expected upper bound $EXPECTED_MODEL_SIZE_UPPER_BOUND"
+    exit 1
+fi
+
+# Install ET with CMake
+if [[ "$TEST_WITH_RUNNER" -eq 1 ]]; then
+    echo "[runner] Building and testing llama_main ..."
+    cmake -DPYTHON_EXECUTABLE=python \
+          -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DEXECUTORCH_ENABLE_LOGGING=1 \
+          -DCMAKE_BUILD_TYPE=Release \
+          -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
+          -DEXECUTORCH_BUILD_XNNPACK=ON \
+          -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
+          -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
+          -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
+          -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
+          -Bcmake-out .
+    cmake --build cmake-out -j16 --config Release --target install
+
+
+    # Install llama runner
+    cmake -DPYTHON_EXECUTABLE=python \
+          -DCMAKE_BUILD_TYPE=Release \
+          -Bcmake-out/examples/models/llama \
+          examples/models/llama
+    cmake --build cmake-out/examples/models/llama -j16 --config Release
+
+    # Run the model
+    ./cmake-out/examples/models/llama/llama_main --model_path=$MODEL_OUT --tokenizer_path="${HF_MODEL_DIR}/tokenizer.json" --prompt="Once upon a time,"
+fi
+
+# Clean up
+rm -f pytorch_model_converted.bin "$MODEL_OUT"
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 251bb238f1b..f5c5161e0cc 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -585,6 +585,37 @@ jobs:
       # Test llama2
       PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llama.sh -model stories110M -build_tool cmake -dtype "${DTYPE}" -mode "${MODE}"
+  test-torchao-huggingface-checkpoints:
+    name: test-torchao-huggingface-checkpoints
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      matrix:
+        model: [qwen3_4b, phi_4_mini]
+        include:
+          - model: qwen3_4b
+            test_with_runner: true
+          - model: phi_4_mini
+            test_with_runner: false
+      fail-fast: false
+    with:
+      runner: linux.2xlarge
+      docker-image: ci-image:executorch-ubuntu-22.04-clang12
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 900
+      script: |
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool cmake
+        pip install -U "huggingface_hub[cli]"
+
+        bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.test_with_runner && '--test_with_runner' || '' }}
+
   #
   # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner.
   # test-llava-runner-macos:
   #   name: test-llava-runner-macos
@@ -993,13 +1024,13 @@ jobs:
       timeout: 60
       script: |
         conda init powershell
-        
+
         powershell -Command "& {
           Set-PSDebug -Trace 1
           \$ErrorActionPreference = 'Stop'
           \$PSNativeCommandUseErrorActionPreference = \$true
 
-          .ci/scripts/setup-windows.ps1 
+          .ci/scripts/setup-windows.ps1
 
           powershell .ci/scripts/test_model.ps1 -modelName ${{ matrix.model }} -backend ${{ matrix.backend }}
-        }"
\ No newline at end of file
+        }"
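Example local invocations of the new script (a sketch, assuming an ExecuTorch development checkout with the Python package installed and the huggingface_hub `hf` CLI on PATH, as set up by the CI job above):

    # Export Qwen3-4B, then build ET + llama_main and run a prompt to sanity-check the export
    bash .ci/scripts/test_torchao_huggingface_checkpoints.sh qwen3_4b --test_with_runner

    # Export Phi-4-mini only (no runner build)
    bash .ci/scripts/test_torchao_huggingface_checkpoints.sh phi_4_mini

Either invocation downloads the corresponding pytorch/*-INT8-INT4 checkpoint from Hugging Face, exports model.pte, and fails if the exported file exceeds the 3GB upper bound.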