From ef3b9ed5719ede8cdd6c211e3f629373634ebeb2 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 13 Aug 2025 17:34:43 -0700 Subject: [PATCH 1/7] Improve optimum coverage in ET (more models, xnnpack on mac) --- .ci/scripts/test_huggingface_optimum_model.py | 119 +++++++++++++++--- .github/workflows/trunk.yml | 110 +++++++--------- 2 files changed, 150 insertions(+), 79 deletions(-) diff --git a/.ci/scripts/test_huggingface_optimum_model.py b/.ci/scripts/test_huggingface_optimum_model.py index 6a31eabb0c8..589d49ef475 100644 --- a/.ci/scripts/test_huggingface_optimum_model.py +++ b/.ci/scripts/test_huggingface_optimum_model.py @@ -1,7 +1,11 @@ import argparse +import gc +import logging +import math import subprocess import tempfile from pathlib import Path +from typing import List import torch from datasets import load_dataset @@ -15,6 +19,7 @@ ) from transformers import ( AutoConfig, + AutoModelForCausalLM, AutoModelForImageClassification, AutoProcessor, AutoTokenizer, @@ -37,6 +42,56 @@ def cli_export(command, model_dir): print(f"Export failed with error: {e}") +def check_causal_lm_output_quality( + model_id: str, generated_tokens: List[int], max_perplexity_threshold: float = 100.0 +): + """ + Evaluates the quality of text generated by a causal language model by calculating its perplexity. + + Args: + model_id: HuggingFace model identifier (e.g., "google/gemma2-2b") + generated_tokens: The tokens generated by the exported model to evaluate + max_perplexity_threshold: Maximum acceptable perplexity (lower is better) + + Returns: + tuple: (is_quality_ok, reason) with boolean result and explanation + """ + logging.info(f"Starting perplexity check with model '{model_id}' ...") + # Load model + model = AutoModelForCausalLM.from_pretrained( + model_id, + low_cpu_mem_usage=True, + use_cache=False, + torch_dtype=torch.bfloat16, + ) + + with torch.no_grad(): + outputs = model(input_ids=generated_tokens, labels=generated_tokens) + + # Get the loss (negative log-likelihood) + loss = outputs.loss.item() + + # Calculate perplexity (exp of the average negative log-likelihood) + perplexity = math.exp(loss) + + is_quality_ok = perplexity <= max_perplexity_threshold + if is_quality_ok: + logging.info( + f"✓ Perplexity check passed: {perplexity:.2f} <= {max_perplexity_threshold}" + ) + else: + logging.warning( + f"✗ Perplexity check failed: {perplexity:.2f} > {max_perplexity_threshold}" + ) + + # Clean up immediately + del model + del outputs + gc.collect() + + return is_quality_ok + + def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only=False): command = [ "optimum-cli", @@ -51,7 +106,15 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only "--output_dir", model_dir, ] - if "coreml" in recipe: + if "xnnpack" in recipe: + if quantize: + command += [ + "--qlinear", + "8da4w", + "--qembedding", + "8w", + ] + elif "coreml" in recipe: command += [ "--disable_dynamic_shapes", ] @@ -63,7 +126,9 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only "8w", ] else: - assert not quantize, "Quantization is not supported for non-CoreML recipes yet" + assert ( + not quantize + ), "Quantization is only supported for XnnPack and CoreML recipes at the moment." 
if not run_only: cli_export(command, model_dir) @@ -77,6 +142,14 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only max_seq_len=64, ) print(f"\nGenerated text:\n\t{generated_text}") + generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids + + # Free memory before loading eager for quality check + del model + del tokenizer + gc.collect() + + assert check_causal_lm_output_quality(model_id, generated_tokens) is True def test_fill_mask(model_id, model_dir, recipe, *, quantize=True, run_only=False): @@ -278,23 +351,39 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False): ) args = parser.parse_args() - model_to_model_id_and_test_function = { - "smollm": ("HuggingFaceTB/SmolLM2-135M", test_text_generation), # works - "qwen3": ("Qwen/Qwen3-0.6B", test_text_generation), # works - "olmo": ("allenai/OLMo-1B-hf", test_text_generation), # works - "gemma3": ("unsloth/gemma-3-1b-it", test_text_generation), # does not export - "phi4": ( + _text_generation_mapping = { + "llama3.2-1b": ("NousResearch/Llama-3.2-1B", test_text_generation), + "qwen3-0.6b": ("Qwen/Qwen3-0.6B", test_text_generation), + "qwen3-1.7b": ("Qwen/Qwen3-1.7B", test_text_generation), + "gemma3-1b": ( + "unsloth/gemma-3-1b-it", + test_text_generation, + ), # does not export for CoreML + "phi4-mini": ( "microsoft/Phi-4-mini-instruct", test_text_generation, - ), # fails to lower - "llama3": ("NousResearch/Llama-3.2-1B", test_text_generation), # works - "bert": ("google-bert/bert-base-uncased", test_fill_mask), # works - "roberta": ("FacebookAI/xlmcl-roberta-base", test_fill_mask), # works - "distilbert": ("distilbert/distilbert-base-uncased", test_fill_mask), # works - "whisper": ("openai/whisper-tiny", test_whisper), # works + ), # fails to lower for CoreML + "smollm2-135m": ("HuggingFaceTB/SmolLM2-135M", test_text_generation), + "smollm3-3b": ("HuggingFaceTB/SmolLM3-3B", test_text_generation), + "olmo": ("allenai/OLMo-1B-hf", test_text_generation), + } + + _mask_fill_mapping = { + "bert": ("google-bert/bert-base-uncased", test_fill_mask), + "roberta": ("FacebookAI/xlmcl-roberta-base", test_fill_mask), + "distilbert": ("distilbert/distilbert-base-uncased", test_fill_mask), + } + + _misc_model_mapping = { + "whisper": ("openai/whisper-tiny", test_whisper), "t5": ("google-t5/t5-small", test_t5), # CoreML runime failure - "vit": ("google/vit-base-patch16-224", test_vit), # works + "vit": ("google/vit-base-patch16-224", test_vit), } + + model_to_model_id_and_test_function = ( + _text_generation_mapping | _mask_fill_mapping | _misc_model_mapping + ) + if args.model not in model_to_model_id_and_test_function: raise ValueError( f"Unknown model name: {args.model}. 
Available models: {model_to_model_id_and_test_function.keys()}" diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 34a955b88a9..12225c971c6 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -732,24 +732,29 @@ jobs: echo "::endgroup::" done - test-huggingface-transformers: + test-huggingface-transformers-xnnpack: # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway if: ${{ !github.event.pull_request.head.repo.fork }} - name: test-huggingface-transformers + name: test-huggingface-transformers-xnnpack uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main permissions: id-token: write contents: read secrets: inherit + # Models below selected based on https://huggingface.co/models?pipeline_tag=text-generation&num_parameters=min:0,max:3B&sort=trending. + xnnpack: &xnnpack + - llama3.2-1b|xnnpack|--quantize + - qwen3-0.6b|xnnpack|--quantize + - qwen3-1.7b|xnnpack|--quantize + - gemma3-1b|xnnpack|--quantize + - phi-4-mini|xnnpack|--quantize + - smollm2-135m|xnnpack|--quantize + - smollm3-3b|xnnpack|--quantize strategy: matrix: - hf_model_id: [ - google/gemma-3-1b-it, - Qwen/Qwen3-0.6B, - HuggingFaceTB/SmolLM2-135M, - meta-llama/Llama-3.2-1B, - allenai/OLMo-1B-hf, - ] + config: + - *xnnpack + - *coreml fail-fast: false with: secrets-env: EXECUTORCH_HF_TOKEN @@ -760,6 +765,12 @@ jobs: timeout: 90 upload-artifact: profiling-artifacts-${{ strategy.job-index }} script: | + set -eux + IFS='|' read -r MODEL RECIPE QUANTIZE <<< "${{ matrix.config }}" + echo "Model: $MODEL" + echo "Recipe: $RECIPE" + echo "Quantize: $QUANTIZE" + echo "::group::Set up ExecuTorch" # The generic Linux job chooses to use base env, not the one setup by the image CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") @@ -797,51 +808,11 @@ jobs: pip list echo "::endgroup::" - echo "::group::Export to ExecuTorch" - # Pass matrix variable as environment variable - export MODEL_ID="${{ matrix.hf_model_id }}" - export OUTPUT_DIR="$(pwd)/${MODEL_ID}_custom_sdpa_kv_cache_8da4w" - pushd optimum-executorch - - ARGS=( - "--model" "${MODEL_ID}" - "--task" "text-generation" - "--recipe" "xnnpack" - "--use_custom_sdpa" - "--use_custom_kv_cache" - "--qlinear" "8da4w" - "--qembedding" "8w" - "--output_dir" "${OUTPUT_DIR}" - ) - - optimum-cli export executorch "${ARGS[@]}" - - ls -FlAGhp ${OUTPUT_DIR} - popd - echo "::endgroup::" - - echo "::group::Inference using python API" - pushd optimum-executorch - python -c " - import os - from optimum.executorch import ExecuTorchModelForCausalLM - from transformers import AutoTokenizer - - model_id = os.getenv('MODEL_ID') - pte_dir = os.getenv('OUTPUT_DIR') - print(f'Loading model {model_id} from {pte_dir}.') - model = ExecuTorchModelForCausalLM.from_pretrained(pte_dir) - generated_text = model.text_generation( - tokenizer=AutoTokenizer.from_pretrained(model_id), - prompt='Simply put, the theory of relativity states that', - max_seq_len=64 - ) - print(generated_text) - " - popd + echo "::group::Run tests" + python .ci/scripts/test_huggingface_optimum_model.py --model ${MODEL} --recipe ${RECIPE} ${QUANTIZE} echo "::endgroup::" - echo "::group::Inference using executor_runner with ETDump" + echo "::group::Generate artifacts for performance profiling" ./cmake-out/executor_runner \ --model_path ${OUTPUT_DIR}/model.pte \ --etdump_path ${OUTPUT_DIR}/etdump.etdp @@ -851,29 +822,40 @@ jobs: python3 -m devtools.inspector.inspector_cli \ --etdump_path ${OUTPUT_DIR}/etdump.etdp \ --tsv_path 
${TSV_PATH} - echo "::endgroup::" - test-huggingface-optimum-coreml: + test-huggingface-transformers-coreml: # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway if: ${{ !github.event.pull_request.head.repo.fork }} - name: test-huggingface-optimum-coreml + name: test-huggingface-transformers-coreml uses: pytorch/test-infra/.github/workflows/macos_job.yml@main permissions: id-token: write contents: read secrets: inherit + # Models below selected based on https://huggingface.co/models?pipeline_tag=text-generation&num_parameters=min:0,max:3B&sort=trending. + xnnpack: &xnnpack + - llama3.2-1b|xnnpack|--quantize + - qwen3-0.6b|xnnpack|--quantize + - qwen3-1.7b|xnnpack|--quantize + - gemma3-1b|xnnpack|--quantize + - phi-4-mini|xnnpack|--quantize + - smollm2-135m|xnnpack|--quantize + - smollm3-3b|xnnpack|--quantize + coreml: &coreml + - llama3.2-1b|coreml_fp32_gpu|--quantize, + - qwen3-0.6b|coreml_fp32_gpu|--quantize, + - qwen3-0.6b|coreml_fp32_gpu|--quantize, + - smollm2-135m|coreml_fp32_gpu|--quantize, + - olmo-1b|coreml_fp32_gpu|--quantize, + # - roberta|coreml_fp32_gpu|--quantize, roberta requires special HF access + - bert|coreml_fp32_gpu|--quantize, + - distilbert|coreml_fp32_gpu|--quantize, strategy: matrix: - config: [ - qwen3|coreml_fp32_gpu|--quantize, - smollm|coreml_fp32_gpu|--quantize, - llama3|coreml_fp32_gpu|--quantize, - olmo|coreml_fp32_gpu|--quantize, - # roberta|coreml_fp32_gpu|--quantize, roberta requires special HF access - bert|coreml_fp32_gpu|--quantize, - distilbert|coreml_fp32_gpu|--quantize, - ] + config: + - *xnnpack + - *coreml fail-fast: false with: secrets-env: EXECUTORCH_HF_TOKEN From 79b2841f94133df95b208a363702771030ae13eb Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Wed, 13 Aug 2025 21:04:53 -0700 Subject: [PATCH 2/7] Fix trunk.yml formatting --- .github/workflows/trunk.yml | 60 +++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 32 deletions(-) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 12225c971c6..8ed07d44378 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -741,20 +741,18 @@ jobs: id-token: write contents: read secrets: inherit - # Models below selected based on https://huggingface.co/models?pipeline_tag=text-generation&num_parameters=min:0,max:3B&sort=trending. - xnnpack: &xnnpack - - llama3.2-1b|xnnpack|--quantize - - qwen3-0.6b|xnnpack|--quantize - - qwen3-1.7b|xnnpack|--quantize - - gemma3-1b|xnnpack|--quantize - - phi-4-mini|xnnpack|--quantize - - smollm2-135m|xnnpack|--quantize - - smollm3-3b|xnnpack|--quantize strategy: matrix: - config: - - *xnnpack - - *coreml + config: [ + # XNNPack. + llama3.2-1b|xnnpack|--quantize, + qwen3-0.6b|xnnpack|--quantize, + qwen3-1.7b|xnnpack|--quantize, + gemma3-1b|xnnpack|--quantize, + phi-4-mini|xnnpack|--quantize, + smollm2-135m|xnnpack|--quantize, + smollm3-3b|xnnpack|--quantize + ] fail-fast: false with: secrets-env: EXECUTORCH_HF_TOKEN @@ -834,28 +832,26 @@ jobs: contents: read secrets: inherit # Models below selected based on https://huggingface.co/models?pipeline_tag=text-generation&num_parameters=min:0,max:3B&sort=trending. 
- xnnpack: &xnnpack - - llama3.2-1b|xnnpack|--quantize - - qwen3-0.6b|xnnpack|--quantize - - qwen3-1.7b|xnnpack|--quantize - - gemma3-1b|xnnpack|--quantize - - phi-4-mini|xnnpack|--quantize - - smollm2-135m|xnnpack|--quantize - - smollm3-3b|xnnpack|--quantize - coreml: &coreml - - llama3.2-1b|coreml_fp32_gpu|--quantize, - - qwen3-0.6b|coreml_fp32_gpu|--quantize, - - qwen3-0.6b|coreml_fp32_gpu|--quantize, - - smollm2-135m|coreml_fp32_gpu|--quantize, - - olmo-1b|coreml_fp32_gpu|--quantize, - # - roberta|coreml_fp32_gpu|--quantize, roberta requires special HF access - - bert|coreml_fp32_gpu|--quantize, - - distilbert|coreml_fp32_gpu|--quantize, strategy: matrix: - config: - - *xnnpack - - *coreml + config: [ + # XNNPack. + llama3.2-1b|xnnpack|--quantize, + qwen3-0.6b|xnnpack|--quantize, + qwen3-1.7b|xnnpack|--quantize, + gemma3-1b|xnnpack|--quantize, + phi-4-mini|xnnpack|--quantize, + smollm2-135m|xnnpack|--quantize, + smollm3-3b|xnnpack|--quantize, + # CoreML. + llama3.2-1b|coreml_fp32_gpu|--quantize, + qwen3-0.6b|coreml_fp32_gpu|--quantize, + qwen3-1.7b|xnnpack|--quantize, + smollm2-135m|coreml_fp32_gpu|--quantize, + olmo-1b|coreml_fp32_gpu|--quantize, + bert|coreml_fp32_gpu|--quantize, + distilbert|coreml_fp32_gpu|--quantize + ] fail-fast: false with: secrets-env: EXECUTORCH_HF_TOKEN From d993828939fd98c3bd6ebd0664ccd4aa812a81ec Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 09:51:36 -0700 Subject: [PATCH 3/7] Fix trunk.yml --- .github/workflows/trunk.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 8ed07d44378..f528e3845e5 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -807,7 +807,8 @@ jobs: echo "::endgroup::" echo "::group::Run tests" - python .ci/scripts/test_huggingface_optimum_model.py --model ${MODEL} --recipe ${RECIPE} ${QUANTIZE} + export OUTPUT_DIR="$(pwd)/${MODEL_ID}_${RECIPE}_${QUANTIZE}" + python .ci/scripts/test_huggingface_optimum_model.py --model ${MODEL} --recipe ${RECIPE} ${QUANTIZE} --model_dir ${OUTPUT_DIR} echo "::endgroup::" echo "::group::Generate artifacts for performance profiling" From 9eefe7fffe4ee6d0575829e4eb870eff442f7144 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 09:52:50 -0700 Subject: [PATCH 4/7] Try pinning llvm-openmp --- .ci/docker/conda-env-ci.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/conda-env-ci.txt b/.ci/docker/conda-env-ci.txt index 995f9635a0d..bcde7fd3fb8 100644 --- a/.ci/docker/conda-env-ci.txt +++ b/.ci/docker/conda-env-ci.txt @@ -1,5 +1,5 @@ cmake=3.31.2 ninja=1.10.2 libuv -llvm-openmp +llvm-openmp=19.1.7 pkg-config From 36e718d544bad7536d679220d2c4afc0a50157ac Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Thu, 14 Aug 2025 17:29:06 -0700 Subject: [PATCH 5/7] Revert "Try pinning llvm-openmp" This reverts commit 9eefe7fffe4ee6d0575829e4eb870eff442f7144. 
--- .ci/docker/conda-env-ci.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/docker/conda-env-ci.txt b/.ci/docker/conda-env-ci.txt index bcde7fd3fb8..995f9635a0d 100644 --- a/.ci/docker/conda-env-ci.txt +++ b/.ci/docker/conda-env-ci.txt @@ -1,5 +1,5 @@ cmake=3.31.2 ninja=1.10.2 libuv -llvm-openmp=19.1.7 +llvm-openmp pkg-config From cd3b4fb34957007c028cbd95a7aa9e1bd466c20f Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Fri, 15 Aug 2025 08:48:05 -0700 Subject: [PATCH 6/7] Fix trunk --- .github/workflows/trunk.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index f528e3845e5..ee17524acce 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -749,7 +749,7 @@ jobs: qwen3-0.6b|xnnpack|--quantize, qwen3-1.7b|xnnpack|--quantize, gemma3-1b|xnnpack|--quantize, - phi-4-mini|xnnpack|--quantize, + phi4-mini|xnnpack|--quantize, smollm2-135m|xnnpack|--quantize, smollm3-3b|xnnpack|--quantize ] @@ -807,7 +807,7 @@ jobs: echo "::endgroup::" echo "::group::Run tests" - export OUTPUT_DIR="$(pwd)/${MODEL_ID}_${RECIPE}_${QUANTIZE}" + export OUTPUT_DIR="$(pwd)/${MODEL}_${RECIPE}_${QUANTIZE}" python .ci/scripts/test_huggingface_optimum_model.py --model ${MODEL} --recipe ${RECIPE} ${QUANTIZE} --model_dir ${OUTPUT_DIR} echo "::endgroup::" @@ -816,7 +816,7 @@ jobs: --model_path ${OUTPUT_DIR}/model.pte \ --etdump_path ${OUTPUT_DIR}/etdump.etdp - export TSV_PATH=artifacts-to-be-uploaded/${MODEL_ID}_op_prof.tsv + export TSV_PATH=artifacts-to-be-uploaded/${MODEL}_op_prof.tsv mkdir -p $(dirname "$TSV_PATH") python3 -m devtools.inspector.inspector_cli \ --etdump_path ${OUTPUT_DIR}/etdump.etdp \ @@ -841,7 +841,7 @@ jobs: qwen3-0.6b|xnnpack|--quantize, qwen3-1.7b|xnnpack|--quantize, gemma3-1b|xnnpack|--quantize, - phi-4-mini|xnnpack|--quantize, + phi4-mini|xnnpack|--quantize, smollm2-135m|xnnpack|--quantize, smollm3-3b|xnnpack|--quantize, # CoreML. From a0c373fdc77737495ffa1f5ca3f805e70d335fc3 Mon Sep 17 00:00:00 2001 From: Jack Zhang <32371937+jackzhxng@users.noreply.github.com> Date: Fri, 15 Aug 2025 09:16:08 -0700 Subject: [PATCH 7/7] Use custom kv cache and sdpa for xnnpack --- .ci/scripts/test_huggingface_optimum_model.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.ci/scripts/test_huggingface_optimum_model.py b/.ci/scripts/test_huggingface_optimum_model.py index 589d49ef475..cd7a7c2124e 100644 --- a/.ci/scripts/test_huggingface_optimum_model.py +++ b/.ci/scripts/test_huggingface_optimum_model.py @@ -107,6 +107,10 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only model_dir, ] if "xnnpack" in recipe: + command += [ + "--use_custom_sdpa", + "--use_custom_kv_cache", + ] if quantize: command += [ "--qlinear",
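
Taken together, these patches change how the XNNPACK text-generation path is exported. Below is a minimal sketch of the command list that the patched test_text_generation() appears to hand to cli_export() for a quantized XNNPACK run; the model id and output directory are illustrative placeholders, and any flags outside the visible hunks are omitted rather than guessed at.

    import subprocess

    # Sketch (not part of the patch) of the optimum-cli invocation assembled by
    # the patched test_text_generation() for a quantized XNNPACK export.
    command = [
        "optimum-cli", "export", "executorch",
        "--model", "Qwen/Qwen3-0.6B",                     # illustrative model id
        "--task", "text-generation",
        "--recipe", "xnnpack",
        "--output_dir", "./qwen3-0.6b_xnnpack_quantized",  # illustrative path
        # Added by PATCH 7/7: custom SDPA and static KV-cache kernels.
        "--use_custom_sdpa",
        "--use_custom_kv_cache",
        # Added by PATCH 1/7 when quantize=True: 8-bit-dynamic-activation /
        # 4-bit-weight linears and 8-bit-weight embeddings.
        "--qlinear", "8da4w",
        "--qembedding", "8w",
    ]
    subprocess.run(command, check=True)  # assumes optimum-executorch is installed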
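Once exported, the resulting .pte is exercised through the optimum-executorch Python API; this is what both the retired inline workflow step and test_text_generation() do. A small sketch, assuming the illustrative artifact directory from above and the prompt used in the workflow:

    from transformers import AutoTokenizer
    from optimum.executorch import ExecuTorchModelForCausalLM

    model_id = "Qwen/Qwen3-0.6B"                 # illustrative model id
    pte_dir = "./qwen3-0.6b_xnnpack_quantized"   # directory holding model.pte

    # Load the exported program and run generation against the eager tokenizer.
    model = ExecuTorchModelForCausalLM.from_pretrained(pte_dir)
    generated_text = model.text_generation(
        tokenizer=AutoTokenizer.from_pretrained(model_id),
        prompt="Simply put, the theory of relativity states that",
        max_seq_len=64,
    )
    print(generated_text)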
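PATCH 1/7 gates output quality on perplexity rather than on inspecting the text by hand: the generated tokens are re-scored by the eager Hugging Face model, and the exponential of the mean negative log-likelihood must stay at or below a threshold (100.0 by default). A condensed sketch of that check, with an illustrative model id and generated text:

    import gc
    import math

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "Qwen/Qwen3-0.6B"                                  # illustrative
    generated_text = "Simply put, the theory of relativity ..."   # output from the .pte

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    token_ids = tokenizer(generated_text, return_tensors="pt").input_ids

    model = AutoModelForCausalLM.from_pretrained(
        model_id, torch_dtype=torch.bfloat16, use_cache=False
    )
    with torch.no_grad():
        # Scoring the tokens against themselves yields the mean negative log-likelihood.
        loss = model(input_ids=token_ids, labels=token_ids).loss.item()

    perplexity = math.exp(loss)
    assert perplexity <= 100.0, f"perplexity {perplexity:.2f} exceeds threshold"

    # Free the eager model immediately, as the CI script does, to keep memory bounded.
    del model
    gc.collect()

A threshold on perplexity is presumably chosen because exact-match against eager output would be too strict for quantized exports, while a loose statistical gate still catches models that export but generate garbage.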