Commit 6da53ef

Guang Yang authored and Github Executorch committed
Add compatible HuggingFace models to benchmark workflow
1 parent 8460d42 commit 6da53ef

File tree

6 files changed: +302, -23 lines changed


.ci/scripts/download_hf_hub.sh

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
#!/bin/bash

# Function to download files from the Hugging Face Hub
# Arguments:
#   1. model_id: The Hugging Face repository ID (e.g., "organization/model_name")
#   2. subdir: The optional subdirectory in the repo to look for files (pass "" if not used)
#   3. file_names: A space-separated list of filenames to be downloaded
# Returns:
#   The directory containing the downloaded files
function download_hf_files() {
  local model_id="$1"
  local subdir="$2"
  shift 2
  local file_names=("$@") # Capture all remaining arguments as an array

  local download_dir

  # Use the first file to determine the download directory
  download_dir=$(python3 -c "
from huggingface_hub import hf_hub_download
# Download the first file and get its directory
path = hf_hub_download(
    repo_id='${model_id}',
    filename='${subdir:+${subdir}/}${file_names[0]}'
)
import os
print(os.path.dirname(path))")

  if [ $? -ne 0 ]; then
    echo "Error: Failed to determine download directory from ${file_names[0]}" >&2
    return 1
  fi

  # Download remaining files into the same directory
  for file_name in "${file_names[@]:1}"; do
    python3 -c "
from huggingface_hub import hf_hub_download
# Download the file
hf_hub_download(
    repo_id='${model_id}',
    filename='${subdir:+${subdir}/}${file_name}'
)"

    if [ $? -ne 0 ]; then
      echo "Error: Failed to download ${file_name} from ${model_id}" >&2
      return 1
    fi
  done

  # Return the directory containing the downloaded files
  echo "$download_dir"
}

# Check if script is called directly
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
  # Parse arguments from CLI
  while [[ $# -gt 0 ]]; do
    case $1 in
      --model_id)
        MODEL_ID="$2"
        shift 2
        ;;
      --subdir)
        SUBDIR="$2"
        shift 2
        ;;
      --files)
        shift
        FILES_TO_DOWNLOAD=()
        while [[ $# -gt 0 && $1 != --* ]]; do
          FILES_TO_DOWNLOAD+=("$1")
          shift
        done
        ;;
      *)
        echo "Unknown option: $1" >&2
        exit 1
        ;;
    esac
  done

  # Validate required arguments
  if [ -z "$MODEL_ID" ] || [ ${#FILES_TO_DOWNLOAD[@]} -eq 0 ]; then
    echo "Usage: $0 --model_id <model_id> --subdir <subdir> --files <file1> [<file2> ...]" >&2
    exit 1
  fi

  # Call the function
  DOWNLOAD_DIR=$(download_hf_files "$MODEL_ID" "$SUBDIR" "${FILES_TO_DOWNLOAD[@]}")
  if [ $? -eq 0 ]; then
    echo "$DOWNLOAD_DIR"
  else
    exit 1
  fi
fi
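
For reference, a typical invocation mirrors what the workflows below do. This is a sketch: the repo id and file list are illustrative, and it assumes huggingface_hub is installed and, for gated repos, that a token is already configured.

# Illustrative call: fetch a tokenizer and checkpoint files from the repo's
# "original" subdirectory and capture the local directory they land in.
DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh \
  --model_id "meta-llama/Llama-3.2-1B" \
  --subdir "original" \
  --files "tokenizer.model" "params.json" "consolidated.00.pth")
echo "Files downloaded to: ${DOWNLOADED_PATH}"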

.github/workflows/android-perf.yml

Lines changed: 101 additions & 4 deletions
@@ -86,7 +86,7 @@ jobs:
   # Separate default values from the workflow dispatch. To ensure defaults are accessible
   # during scheduled runs and to provide flexibility for different defaults between
   # on-demand and periodic benchmarking.
-  CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'stories110M,dl3,mv3,mv2,ic4,ic3,vit' || 'stories110M' }}
+  CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'stories110M,dl3,mv3,mv2,ic4,ic3,vit,meta-llama/Llama-3.2-1B' || 'stories110M' }}
   CRON_DEFAULT_DEVICES: samsung_galaxy_s22
   CRON_DEFAULT_DELEGATES: ${{ github.event_name == 'schedule' && 'xnnpack,qnn' || 'xnnpack' }}
   run: |
@@ -108,6 +108,7 @@ jobs:
   declare -A DEVICE_POOL_ARNS
   DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
   DEVICE_POOL_ARNS[samsung_galaxy_s24]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db"
+  DEVICE_POOL_ARNS[google_pixel_8_pro]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a"

   # Resolve device names with their corresponding ARNs
   if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then
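
The device pools above are a plain bash associative array keyed by human-readable device names, so adding a device is a one-line change. A standalone sketch of the lookup (the device name is real, the ARN value is truncated and illustrative):

#!/bin/bash
# Sketch of the device-name -> Device Farm ARN lookup used by this step.
declare -A DEVICE_POOL_ARNS
DEVICE_POOL_ARNS[google_pixel_8_pro]="arn:aws:devicefarm:us-west-2:...truncated..."  # illustrative value
DEVICE="google_pixel_8_pro"
if [[ -n "${DEVICE_POOL_ARNS[$DEVICE]:-}" ]]; then
  echo "$DEVICE -> ${DEVICE_POOL_ARNS[$DEVICE]}"
else
  echo "Unknown device: $DEVICE" >&2
fi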
@@ -168,18 +169,20 @@ jobs:
   name: export-models
   uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
   needs: set-parameters
+  secrets: inherit
   strategy:
     matrix:
       model: ${{ fromJson(needs.set-parameters.outputs.models) }}
       delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
     fail-fast: false
   with:
-    runner: linux.4xlarge
+    runner: linux.4xlarge.memory
     docker-image: executorch-ubuntu-22.04-qnn-sdk
     submodules: 'true'
     timeout: 60
     upload-artifact: android-models
     upload-artifact-to-s3: true
+    secrets-env: EXECUTORCH_HF_TOKEN
     script: |
       # The generic Linux job chooses to use base env, not the one setup by the image
       echo "::group::Setting up dev environment"
@@ -195,9 +198,102 @@ jobs:

   echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}"
   BUILD_MODE="cmake"
-  DTYPE="fp32"

-  if [[ ${{ matrix.model }} =~ ^stories* ]]; then
+  if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
+    pip install -U "huggingface_hub[cli]"
+    huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+    pip install accelerate sentencepiece tiktoken
+    # HuggingFace model. Assume the pattern is always like "<org>/<repo>"
+    HF_MODEL_REPO=${{ matrix.model }}
+    OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
+
+    if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
+      # Llama models on Hugging Face
+      if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
+        # SpinQuant
+        # Download prequantized checkpoint from Hugging Face
+        DOWNLOADED_PATH=$(
+          bash .ci/scripts/download_hf_hub.sh \
+            --model_id "${HF_MODEL_REPO}" \
+            --files "tokenizer.model" "params.json" "consolidated.00.pth"
+        )
+        # Export using ExecuTorch's model definition
+        python -m examples.models.llama.export_llama \
+          --model "llama3_2" \
+          --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+          --params "${DOWNLOADED_PATH}/params.json" \
+          --use_sdpa_with_kv_cache \
+          -X \
+          --xnnpack-extended-ops \
+          --preq_mode 8da4w_output_8da8w \
+          --preq_group_size 32 \
+          --max_seq_length 2048 \
+          --output_name "${OUT_ET_MODEL_NAME}.pte" \
+          -kv \
+          -d fp32 \
+          --preq_embedding_quantize 8,0 \
+          --use_spin_quant native \
+          --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+        ls -lh "${OUT_ET_MODEL_NAME}.pte"
+      elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
+        # QAT + LoRA
+        # Download prequantized checkpoint from Hugging Face
+        DOWNLOADED_PATH=$(
+          bash .ci/scripts/download_hf_hub.sh \
+            --model_id "${HF_MODEL_REPO}" \
+            --files "tokenizer.model" "params.json" "consolidated.00.pth"
+        )
+        # Export using ExecuTorch's model definition
+        python -m examples.models.llama.export_llama \
+          --model "llama3_2" \
+          --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+          --params "${DOWNLOADED_PATH}/params.json" \
+          -qat \
+          -lora 16 \
+          --preq_mode 8da4w_output_8da8w \
+          --preq_group_size 32 \
+          --preq_embedding_quantize 8,0 \
+          --use_sdpa_with_kv_cache \
+          -kv \
+          -X \
+          --xnnpack-extended-ops \
+          -d fp32 \
+          --max_seq_length 2048 \
+          --output_name "${OUT_ET_MODEL_NAME}.pte" \
+          --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+        ls -lh "${OUT_ET_MODEL_NAME}.pte"
+      else
+        if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
+          # Original BF16 version, without any quantization
+          DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+          python -m examples.models.llama.export_llama \
+            --model "llama3_2" \
+            --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+            --params "${DOWNLOADED_PATH}/params.json" \
+            -kv \
+            --use_sdpa_with_kv_cache \
+            -X \
+            -d bf16 \
+            --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+            --output_name="${OUT_ET_MODEL_NAME}.pte"
+          ls -lh "${OUT_ET_MODEL_NAME}.pte"
+        else
+          # By default, test with the Hugging Face model and the xnnpack recipe
+          DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
+          python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
+          ls -lh "${OUT_ET_MODEL_NAME}.pte"
+        fi
+      fi
+    else
+      echo "Unsupported model ${{ matrix.model }}"
+      exit 1
+    fi
+
+    zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
+    ls -lh model.zip
+    mkdir -p "${ARTIFACTS_DIR_NAME}"
+    mv model.zip "${ARTIFACTS_DIR_NAME}"
+  elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
     # Install requirements for export_llama
     PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
     # Test llama2
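
The OUT_ET_MODEL_NAME derivation in this hunk keeps only the repo half of the id, swaps underscores for hyphens, lowercases the result, and appends the delegate. A standalone sketch, using the model added to the cron default list (values illustrative):

#!/bin/bash
# Reproduces the artifact-name derivation from the export step above.
HF_MODEL_REPO="meta-llama/Llama-3.2-1B"  # illustrative repo id
DELEGATE="xnnpack"                       # illustrative delegate
OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${DELEGATE}"
echo "$OUT_ET_MODEL_NAME"  # prints: llama-3.2-1b_xnnpack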
@@ -209,6 +305,7 @@ jobs:
   echo "Unsupported delegate ${{ matrix.delegate }}"
   exit 1
 fi
+DTYPE="fp32"
 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \
   -model "${{ matrix.model }}" \
   -build_tool "${BUILD_MODE}" \

.github/workflows/apple-perf.yml

Lines changed: 99 additions & 3 deletions
@@ -76,7 +76,7 @@ jobs:
   # Separate default values from the workflow dispatch. To ensure defaults are accessible
   # during scheduled runs and to provide flexibility for different defaults between
   # on-demand and periodic benchmarking.
-  CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'stories110M,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l' || 'stories110M' }}
+  CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'stories110M,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B' || 'stories110M' }}
   CRON_DEFAULT_DEVICES: apple_iphone_15
   CRON_DEFAULT_DELEGATES: ${{ github.event_name == 'schedule' && 'xnnpack,coreml,mps' || 'xnnpack' }}
   run: |
@@ -155,6 +155,7 @@ jobs:
   name: export-models
   uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
   needs: set-parameters
+  secrets: inherit
   strategy:
     matrix:
       model: ${{ fromJson(needs.set-parameters.outputs.models) }}
@@ -168,6 +169,7 @@ jobs:
   timeout: 60
   upload-artifact: ios-models
   upload-artifact-to-s3: true
+  secrets-env: EXECUTORCH_HF_TOKEN
   script: |
     set -eux

@@ -194,9 +196,102 @@ jobs:

   echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}"
   BUILD_MODE="cmake"
-  DTYPE="fp32"

-  if [[ ${{ matrix.model }} =~ ^stories* ]]; then
+  if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
+    pip install -U "huggingface_hub[cli]"
+    huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+    ${CONDA_RUN} pip install accelerate sentencepiece tiktoken
+    # HuggingFace model. Assume the pattern is always like "<org>/<repo>"
+    HF_MODEL_REPO=${{ matrix.model }}
+    OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
+
+    if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
+      # Llama models on Hugging Face
+      if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
+        # SpinQuant
+        # Download prequantized checkpoint from Hugging Face
+        DOWNLOADED_PATH=$(
+          bash .ci/scripts/download_hf_hub.sh \
+            --model_id "${HF_MODEL_REPO}" \
+            --files "tokenizer.model" "params.json" "consolidated.00.pth"
+        )
+        # Export using ExecuTorch's model definition
+        ${CONDA_RUN} python -m examples.models.llama.export_llama \
+          --model "llama3_2" \
+          --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+          --params "${DOWNLOADED_PATH}/params.json" \
+          --use_sdpa_with_kv_cache \
+          -X \
+          --xnnpack-extended-ops \
+          --preq_mode 8da4w_output_8da8w \
+          --preq_group_size 32 \
+          --max_seq_length 2048 \
+          --output_name "${OUT_ET_MODEL_NAME}.pte" \
+          -kv \
+          -d fp32 \
+          --preq_embedding_quantize 8,0 \
+          --use_spin_quant native \
+          --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+        ls -lh "${OUT_ET_MODEL_NAME}.pte"
+      elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
+        # QAT + LoRA
+        # Download prequantized checkpoint from Hugging Face
+        DOWNLOADED_PATH=$(
+          bash .ci/scripts/download_hf_hub.sh \
+            --model_id "${HF_MODEL_REPO}" \
+            --files "tokenizer.model" "params.json" "consolidated.00.pth"
+        )
+        # Export using ExecuTorch's model definition
+        ${CONDA_RUN} python -m examples.models.llama.export_llama \
+          --model "llama3_2" \
+          --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+          --params "${DOWNLOADED_PATH}/params.json" \
+          -qat \
+          -lora 16 \
+          --preq_mode 8da4w_output_8da8w \
+          --preq_group_size 32 \
+          --preq_embedding_quantize 8,0 \
+          --use_sdpa_with_kv_cache \
+          -kv \
+          -X \
+          --xnnpack-extended-ops \
+          -d fp32 \
+          --max_seq_length 2048 \
+          --output_name "${OUT_ET_MODEL_NAME}.pte" \
+          --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+        ls -lh "${OUT_ET_MODEL_NAME}.pte"
+      else
+        if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
+          # Original BF16 version, without any quantization
+          DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+          ${CONDA_RUN} python -m examples.models.llama.export_llama \
+            --model "llama3_2" \
+            --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+            --params "${DOWNLOADED_PATH}/params.json" \
+            -kv \
+            --use_sdpa_with_kv_cache \
+            -X \
+            -d bf16 \
+            --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+            --output_name="${OUT_ET_MODEL_NAME}.pte"
+          ls -lh "${OUT_ET_MODEL_NAME}.pte"
+        else
+          # By default, test with the Hugging Face model and the xnnpack recipe
+          DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
+          ${CONDA_RUN} python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
+          ls -lh "${OUT_ET_MODEL_NAME}.pte"
+        fi
+      fi
+    else
+      echo "Unsupported model ${{ matrix.model }}"
+      exit 1
+    fi
+
+    zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
+    ls -lh model.zip
+    mkdir -p "${ARTIFACTS_DIR_NAME}"
+    mv model.zip "${ARTIFACTS_DIR_NAME}"
+  elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
     # Install requirements for export_llama
     PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
       bash examples/models/llama/install_requirements.sh
@@ -209,6 +304,7 @@ jobs:
 elif [[ ${{ matrix.delegate }} == "mps" ]]; then
   DELEGATE_CONFIG="mps"
 fi
+DTYPE="fp32"
 PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
   bash .ci/scripts/test_llama.sh \
     -model "${{ matrix.model }}" \
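
As in the Android workflow, the "^[^/]+/[^/]+$" test above is what routes "<org>/<repo>" ids down the Hugging Face export path while bare names such as stories110M keep the existing flow. A minimal illustration (model names illustrative):

#!/bin/bash
# Minimal illustration of the model-routing regex shared by both workflows.
for MODEL in "stories110M" "meta-llama/Llama-3.2-1B"; do
  if [[ "$MODEL" =~ ^[^/]+/[^/]+$ ]]; then
    echo "$MODEL -> Hugging Face export path"
  else
    echo "$MODEL -> built-in example model path"
  fi
done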
