 # Separate default values from the workflow dispatch so that defaults are accessible
 # during scheduled runs and so that on-demand and periodic benchmarking can use
 # different defaults.
-CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'stories110M,dl3,mv3,mv2,ic4,ic3,vit' || 'stories110M' }}
+CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'stories110M,dl3,mv3,mv2,ic4,ic3,vit,meta-llama/Llama-3.2-1B' || 'stories110M' }}
 CRON_DEFAULT_DEVICES: samsung_galaxy_s22
 CRON_DEFAULT_DELEGATES: ${{ github.event_name == 'schedule' && 'xnnpack,qnn' || 'xnnpack' }}
 run: |
@@ -108,6 +108,7 @@ jobs:
   declare -A DEVICE_POOL_ARNS
   DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
   DEVICE_POOL_ARNS[samsung_galaxy_s24]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db"
+  DEVICE_POOL_ARNS[google_pixel_8_pro]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a"

   # Resolve device names with their corresponding ARNs
   if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then
@@ -168,18 +169,20 @@ jobs:
 name: export-models
 uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
 needs: set-parameters
+secrets: inherit
 strategy:
   matrix:
     model: ${{ fromJson(needs.set-parameters.outputs.models) }}
     delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
   fail-fast: false
 with:
-  runner: linux.4xlarge
+  runner: linux.4xlarge.memory
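+  # the .memory variant gives the export step extra RAM for the larger Llama checkpoints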
   docker-image: executorch-ubuntu-22.04-qnn-sdk
   submodules: 'true'
   timeout: 60
   upload-artifact: android-models
   upload-artifact-to-s3: true
+  secrets-env: EXECUTORCH_HF_TOKEN
   script: |
     # The generic Linux job chooses to use base env, not the one setup by the image
     echo "::group::Setting up dev environment"
@@ -195,9 +198,102 @@ jobs:

     echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}"
     BUILD_MODE="cmake"
-    DTYPE="fp32"

-    if [[ ${{ matrix.model }} =~ ^stories* ]]; then
+    if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
+      pip install -U "huggingface_hub[cli]"
+      huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
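+      # EXECUTORCH_HF_TOKEN is passed in via secrets-env above and surfaced to the job as SECRET_EXECUTORCH_HF_TOKEN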
+      pip install accelerate sentencepiece tiktoken
+      # HuggingFace model. Assume the pattern is always like "<org>/<repo>"
+      HF_MODEL_REPO=${{ matrix.model }}
+      OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
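+      # e.g. "meta-llama/Llama-3.2-1B" with the xnnpack delegate becomes "llama-3.2-1b_xnnpack"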
+
+      if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
+        # Llama models on Hugging Face
+        if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
+          # SpinQuant
+          # Download the pre-quantized checkpoint from Hugging Face
+          DOWNLOADED_PATH=$(
+            bash .ci/scripts/download_hf_hub.sh \
+              --model_id "${HF_MODEL_REPO}" \
+              --files "tokenizer.model" "params.json" "consolidated.00.pth"
+          )
+          # Export using ExecuTorch's model definition
+          python -m examples.models.llama.export_llama \
+            --model "llama3_2" \
+            --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+            --params "${DOWNLOADED_PATH}/params.json" \
+            --use_sdpa_with_kv_cache \
+            -X \
+            --xnnpack-extended-ops \
+            --preq_mode 8da4w_output_8da8w \
+            --preq_group_size 32 \
+            --max_seq_length 2048 \
+            --output_name "${OUT_ET_MODEL_NAME}.pte" \
+            -kv \
+            -d fp32 \
+            --preq_embedding_quantize 8,0 \
+            --use_spin_quant native \
+            --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+          ls -lh "${OUT_ET_MODEL_NAME}.pte"
+        elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
+          # QAT + LoRA
+          # Download the pre-quantized checkpoint from Hugging Face
+          DOWNLOADED_PATH=$(
+            bash .ci/scripts/download_hf_hub.sh \
+              --model_id "${HF_MODEL_REPO}" \
+              --files "tokenizer.model" "params.json" "consolidated.00.pth"
+          )
+          # Export using ExecuTorch's model definition
+          python -m examples.models.llama.export_llama \
+            --model "llama3_2" \
+            --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+            --params "${DOWNLOADED_PATH}/params.json" \
+            -qat \
+            -lora 16 \
+            --preq_mode 8da4w_output_8da8w \
+            --preq_group_size 32 \
+            --preq_embedding_quantize 8,0 \
+            --use_sdpa_with_kv_cache \
+            -kv \
+            -X \
+            --xnnpack-extended-ops \
+            -d fp32 \
+            --max_seq_length 2048 \
+            --output_name "${OUT_ET_MODEL_NAME}.pte" \
+            --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
+          ls -lh "${OUT_ET_MODEL_NAME}.pte"
+        else
+          if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
+            # Original BF16 version, without any quantization
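+            # The repo's "original" subdir holds the Meta-format checkpoint (consolidated.00.pth, params.json, tokenizer.model)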
+            DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+            python -m examples.models.llama.export_llama \
+              --model "llama3_2" \
+              --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+              --params "${DOWNLOADED_PATH}/params.json" \
+              -kv \
+              --use_sdpa_with_kv_cache \
+              -X \
+              -d bf16 \
+              --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+              --output_name="${OUT_ET_MODEL_NAME}.pte"
+            ls -lh "${OUT_ET_MODEL_NAME}.pte"
+          else
+            # By default, test with the Hugging Face model and the xnnpack recipe
+            DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
+            python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
+            ls -lh "${OUT_ET_MODEL_NAME}.pte"
+          fi
+        fi
+      else
+        echo "Unsupported model ${{ matrix.model }}"
+        exit 1
+      fi
+
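+      # Bundle the exported .pte with its tokenizer so they ship together in the android-models artifact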
+      zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
+      ls -lh model.zip
+      mkdir -p "${ARTIFACTS_DIR_NAME}"
+      mv model.zip "${ARTIFACTS_DIR_NAME}"
+    elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
       # Install requirements for export_llama
       PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
       # Test llama2
@@ -209,6 +305,7 @@ jobs:
         echo "Unsupported delegate ${{ matrix.delegate }}"
         exit 1
       fi
+      DTYPE="fp32"
       PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \
         -model "${{ matrix.model }}" \
         -build_tool "${BUILD_MODE}" \