diff --git a/.ci/scripts/gather_benchmark_configs.py b/.ci/scripts/gather_benchmark_configs.py
index 0fe60a0d772..c15a64e3ede 100755
--- a/.ci/scripts/gather_benchmark_configs.py
+++ b/.ci/scripts/gather_benchmark_configs.py
@@ -43,6 +43,7 @@
         "coreml_fp16",
         "mps",
         "llama3_coreml_ane",
+        "llama3_mps_8da4w",
     ],
 }
 
diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml
index ea88be441cb..83f03323e9a 100644
--- a/.github/workflows/apple-perf.yml
+++ b/.github/workflows/apple-perf.yml
@@ -298,6 +298,23 @@ jobs:
               --coreml-compute-units cpu_and_ne \
               --output_name="${OUT_ET_MODEL_NAME}.pte"
             ls -lh "${OUT_ET_MODEL_NAME}.pte"
+          elif [[ ${{ matrix.config }} == "llama3_mps_8da4w" ]]; then
+            # MPS 8da4w: export llama3_2 from the downloaded original checkpoint with --mps and -qmode 8da4w (group_size 32, embedding-quantize 4,32)
+            DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+            ${CONDA_RUN} python -m examples.models.llama.export_llama \
+              --model "llama3_2" \
+              --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+              --params "${DOWNLOADED_PATH}/params.json" \
+              -kv \
+              --use_sdpa_with_kv_cache \
+              --disable_dynamic_shape \
+              --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+              --mps \
+              -qmode 8da4w \
+              --group_size 32 \
+              --embedding-quantize 4,32 \
+              --output_name="${OUT_ET_MODEL_NAME}.pte"
+            ls -lh "${OUT_ET_MODEL_NAME}.pte"
           else
             # By default, test with the Hugging Face model and the xnnpack recipe
             DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")