@@ -108,6 +108,7 @@ jobs:
108108 declare -A DEVICE_POOL_ARNS
109109 DEVICE_POOL_ARNS[samsung_galaxy_s22]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
110110 DEVICE_POOL_ARNS[samsung_galaxy_s24]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/98f8788c-2e25-4a3c-8bb2-0d1e8897c0db"
111+ DEVICE_POOL_ARNS[google_pixel_8_pro]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/d65096ab-900b-4521-be8b-a3619b69236a"
111112
112113 # Resolve device names with their corresponding ARNs
113114 if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then
@@ -168,18 +169,20 @@ jobs:
168169 name : export-models
169170 uses : pytorch/test-infra/.github/workflows/linux_job.yml@main
170171 needs : set-parameters
172+ secrets : inherit
171173 strategy :
172174 matrix :
173175 model : ${{ fromJson(needs.set-parameters.outputs.models) }}
174176 delegate : ${{ fromJson(needs.set-parameters.outputs.delegates) }}
175177 fail-fast : false
176178 with :
177- runner : linux.4xlarge
179+ runner : linux.2xlarge.memory
178180 docker-image : executorch-ubuntu-22.04-qnn-sdk
179181 submodules : 'true'
180182 timeout : 60
181183 upload-artifact : android-models
182184 upload-artifact-to-s3 : true
185+ secrets-env : EXECUTORCH_HF_TOKEN
183186 script : |
184187 # The generic Linux job chooses to use base env, not the one setup by the image
185188 echo "::group::Setting up dev environment"
@@ -190,14 +193,109 @@ jobs:
190193 PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
191194 fi
192195 PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
196+ # Install requirements for export_llama
197+ PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
193198 ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }}
194199 echo "::endgroup::"
195200
196201 echo "::group::Exporting ${{ matrix.delegate }} model: ${{ matrix.model }}"
197202 BUILD_MODE="cmake"
198- DTYPE="fp32"
199203
200- if [[ ${{ matrix.model }} =~ ^stories* ]]; then
204+ if [[ ${{ matrix.model }} =~ ^[^/]+/[^/]+$ ]]; then
205+ pip install -U "huggingface_hub[cli]"
206+ huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
207+ pip install accelerate sentencepiece
208+ # HuggingFace model. Assume the pattern is always like "<org>/<repo>"
209+ HF_MODEL_REPO=${{ matrix.model }}
210+ OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.delegate }}"
211+
212+ if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
213+ # Llama models on Hugging Face
214+ if [[ "$HF_MODEL_REPO" == *"SpinQuant"* ]]; then
215+ # SpinQuant
216+ # Download prequantized checkpoint from Hugging Face
217+ DOWNLOADED_PATH=$(
218+ bash .ci/scripts/download_hf_hub.sh \
219+ --model_id "${HF_MODEL_REPO}" \
220+ --files "tokenizer.model" "params.json" "consolidated.00.pth"
221+ )
222+ # Export using ExecuTorch's model definition
223+ python -m examples.models.llama.export_llama \
224+ --model "llama3_2" \
225+ --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
226+ --params "${DOWNLOADED_PATH}/params.json" \
227+ --use_sdpa_with_kv_cache \
228+ -X \
229+ --xnnpack-extended-ops \
230+ --preq_mode 8da4w_output_8da8w \
231+ --preq_group_size 32 \
232+ --max_seq_length 2048 \
233+ --output_name "${OUT_ET_MODEL_NAME}.pte" \
234+ -kv \
235+ -d fp32 \
236+ --preq_embedding_quantize 8,0 \
237+ --use_spin_quant native \
238+ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
239+ ls -lh "${OUT_ET_MODEL_NAME}.pte"
240+ elif [[ "$HF_MODEL_REPO" == *"QLORA"* ]]; then
241+ # QAT + LoRA
242+ # Download prequantized checkpoint from Hugging Face
243+ DOWNLOADED_PATH=$(
244+ bash .ci/scripts/download_hf_hub.sh \
245+ --model_id "${HF_MODEL_REPO}" \
246+ --files "tokenizer.model" "params.json" "consolidated.00.pth"
247+ )
248+ # Export using ExecuTorch's model definition
249+ python -m examples.models.llama.export_llama \
250+ --model "llama3_2" \
251+ --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
252+ --params "${DOWNLOADED_PATH}/params.json" \
253+ -qat \
254+ -lora 16 \
255+ --preq_mode 8da4w_output_8da8w \
256+ --preq_group_size 32 \
257+ --preq_embedding_quantize 8,0 \
258+ --use_sdpa_with_kv_cache \
259+ -kv \
260+ -X \
261+ --xnnpack-extended-ops \
262+ -d fp32 \
263+ --max_seq_length 2048 \
264+ --output_name "${OUT_ET_MODEL_NAME}.pte" \
265+ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
266+ ls -lh "${OUT_ET_MODEL_NAME}.pte"
267+ else
268+ if [[ ${{ matrix.delegate }} == "xnnpack_bf16" ]]; then
269+ # Original BF16 version, without any quantization
270+ DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
271+ python -m examples.models.llama.export_llama \
272+ --model "llama3_2" \
273+ --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
274+ --params "${DOWNLOADED_PATH}/params.json" \
275+ -kv \
276+ --use_sdpa_with_kv_cache \
277+ -X \
278+ -d bf16 \
279+ --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
280+ --output_name="${OUT_ET_MODEL_NAME}.pte"
281+ ls -lh "${OUT_ET_MODEL_NAME}.pte"
282+ else
283+ # By default, test with the Hugging Face model and the xnnpack recipe
284+ DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
285+ python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
286+ ls -lh "${OUT_ET_MODEL_NAME}.pte"
287+ fi
288+ fi
289+ else
290+ echo "Unsupported model ${{ matrix.model }}"
291+ exit 1
292+ fi
293+
294+ zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
295+ ls -lh model.zip
296+ mkdir -p "${ARTIFACTS_DIR_NAME}"
297+ mv model.zip "${ARTIFACTS_DIR_NAME}"
298+ elif [[ ${{ matrix.model }} =~ ^stories* ]]; then
201299 # Install requirements for export_llama
202300 PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
203301 # Test llama2
@@ -209,6 +307,7 @@ jobs:
209307 echo "Unsupported delegate ${{ matrix.delegate }}"
210308 exit 1
211309 fi
310+ DTYPE="fp32"
212311 PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \
213312 -model "${{ matrix.model }}" \
214313 -build_tool "${BUILD_MODE}" \