Commit 2c0752a

Improve optimum coverage in ET (more models, xnnpack on mac) (#13400)

### Summary
Improves CI coverage of Optimum in ET:
- Adds more model coverage
- Adds XNNPack coverage on macOS
- Adds perplexity checks for causal LM tests instead of just printing output
- Refactors all Optimum CI to use the same testing modules

### Test plan
Run trunk tests.

1 parent: 1b4968f

File tree: 2 files changed (+148, −76 lines)


.ci/scripts/test_huggingface_optimum_model.py

Lines changed: 108 additions & 15 deletions
@@ -1,7 +1,11 @@
 import argparse
+import gc
+import logging
+import math
 import subprocess
 import tempfile
 from pathlib import Path
+from typing import List
 
 import torch
 from datasets import load_dataset
@@ -15,6 +19,7 @@
 )
 from transformers import (
     AutoConfig,
+    AutoModelForCausalLM,
     AutoModelForImageClassification,
     AutoProcessor,
     AutoTokenizer,
@@ -37,6 +42,56 @@ def cli_export(command, model_dir):
         print(f"Export failed with error: {e}")
 
 
+def check_causal_lm_output_quality(
+    model_id: str, generated_tokens: List[int], max_perplexity_threshold: float = 100.0
+):
+    """
+    Evaluates the quality of text generated by a causal language model by calculating its perplexity.
+
+    Args:
+        model_id: HuggingFace model identifier (e.g., "google/gemma2-2b")
+        generated_tokens: The tokens generated by the exported model to evaluate
+        max_perplexity_threshold: Maximum acceptable perplexity (lower is better)
+
+    Returns:
+        bool: True if the perplexity is within the threshold, False otherwise
+    """
+    logging.info(f"Starting perplexity check with model '{model_id}' ...")
+    # Load model
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id,
+        low_cpu_mem_usage=True,
+        use_cache=False,
+        torch_dtype=torch.bfloat16,
+    )
+
+    with torch.no_grad():
+        outputs = model(input_ids=generated_tokens, labels=generated_tokens)
+
+    # Get the loss (negative log-likelihood)
+    loss = outputs.loss.item()
+
+    # Calculate perplexity (exp of the average negative log-likelihood)
+    perplexity = math.exp(loss)
+
+    is_quality_ok = perplexity <= max_perplexity_threshold
+    if is_quality_ok:
+        logging.info(
+            f"✓ Perplexity check passed: {perplexity:.2f} <= {max_perplexity_threshold}"
+        )
+    else:
+        logging.warning(
+            f"✗ Perplexity check failed: {perplexity:.2f} > {max_perplexity_threshold}"
+        )
+
+    # Clean up immediately
+    del model
+    del outputs
+    gc.collect()
+
+    return is_quality_ok
+
+
 def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only=False):
     command = [
         "optimum-cli",
@@ -51,7 +106,19 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only=False):
         "--output_dir",
         model_dir,
     ]
-    if "coreml" in recipe:
+    if "xnnpack" in recipe:
+        command += [
+            "--use_custom_sdpa",
+            "--use_custom_kv_cache",
+        ]
+        if quantize:
+            command += [
+                "--qlinear",
+                "8da4w",
+                "--qembedding",
+                "8w",
+            ]
+    elif "coreml" in recipe:
         command += [
             "--disable_dynamic_shapes",
         ]
@@ -63,7 +130,9 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only=False):
                 "8w",
             ]
     else:
-        assert not quantize, "Quantization is not supported for non-CoreML recipes yet"
+        assert (
+            not quantize
+        ), "Quantization is only supported for XnnPack and CoreML recipes at the moment."
 
     if not run_only:
         cli_export(command, model_dir)
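For an XNNPACK recipe with `quantize=True`, the branches above extend the base command into the same invocation the old workflow used to spell out by hand. Roughly, with a placeholder model id and output dir:

```python
# Sketch: the list test_text_generation assembles for recipe="xnnpack",
# quantize=True. Model id and output dir are placeholder values.
command = [
    "optimum-cli", "export", "executorch",
    "--model", "Qwen/Qwen3-0.6B",
    "--task", "text-generation",
    "--recipe", "xnnpack",
    "--output_dir", "/tmp/qwen3-0.6b_xnnpack",
    "--use_custom_sdpa",
    "--use_custom_kv_cache",
    "--qlinear", "8da4w",
    "--qembedding", "8w",
]
```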
@@ -77,6 +146,14 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only=False):
         max_seq_len=64,
     )
     print(f"\nGenerated text:\n\t{generated_text}")
+    generated_tokens = tokenizer(generated_text, return_tensors="pt").input_ids
+
+    # Free memory before loading eager for quality check
+    del model
+    del tokenizer
+    gc.collect()
+
+    assert check_causal_lm_output_quality(model_id, generated_tokens) is True
 
 
 def test_fill_mask(model_id, model_dir, recipe, *, quantize=True, run_only=False):
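Because export only runs when `run_only` is false, the same function can re-validate an existing export (inference plus the new perplexity gate) without invoking `optimum-cli` again; a hypothetical direct call:

```python
# Hypothetical re-validation of an already-exported model: skips the
# optimum-cli export step and goes straight to inference + quality gate.
test_text_generation(
    model_id="Qwen/Qwen3-0.6B",
    model_dir="/tmp/qwen3-0.6b_xnnpack",  # placeholder dir containing model.pte
    recipe="xnnpack",
    quantize=True,
    run_only=True,
)
```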
@@ -278,23 +355,39 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False):
     )
     args = parser.parse_args()
 
-    model_to_model_id_and_test_function = {
-        "smollm": ("HuggingFaceTB/SmolLM2-135M", test_text_generation),  # works
-        "qwen3": ("Qwen/Qwen3-0.6B", test_text_generation),  # works
-        "olmo": ("allenai/OLMo-1B-hf", test_text_generation),  # works
-        "gemma3": ("unsloth/gemma-3-1b-it", test_text_generation),  # does not export
-        "phi4": (
+    _text_generation_mapping = {
+        "llama3.2-1b": ("NousResearch/Llama-3.2-1B", test_text_generation),
+        "qwen3-0.6b": ("Qwen/Qwen3-0.6B", test_text_generation),
+        "qwen3-1.7b": ("Qwen/Qwen3-1.7B", test_text_generation),
+        "gemma3-1b": (
+            "unsloth/gemma-3-1b-it",
+            test_text_generation,
+        ),  # does not export for CoreML
+        "phi4-mini": (
             "microsoft/Phi-4-mini-instruct",
             test_text_generation,
-        ),  # fails to lower
-        "llama3": ("NousResearch/Llama-3.2-1B", test_text_generation),  # works
-        "bert": ("google-bert/bert-base-uncased", test_fill_mask),  # works
-        "roberta": ("FacebookAI/xlm-roberta-base", test_fill_mask),  # works
-        "distilbert": ("distilbert/distilbert-base-uncased", test_fill_mask),  # works
-        "whisper": ("openai/whisper-tiny", test_whisper),  # works
+        ),  # fails to lower for CoreML
+        "smollm2-135m": ("HuggingFaceTB/SmolLM2-135M", test_text_generation),
+        "smollm3-3b": ("HuggingFaceTB/SmolLM3-3B", test_text_generation),
+        "olmo": ("allenai/OLMo-1B-hf", test_text_generation),
+    }
+
+    _mask_fill_mapping = {
+        "bert": ("google-bert/bert-base-uncased", test_fill_mask),
+        "roberta": ("FacebookAI/xlm-roberta-base", test_fill_mask),
+        "distilbert": ("distilbert/distilbert-base-uncased", test_fill_mask),
+    }
+
+    _misc_model_mapping = {
+        "whisper": ("openai/whisper-tiny", test_whisper),
         "t5": ("google-t5/t5-small", test_t5),  # CoreML runtime failure
-        "vit": ("google/vit-base-patch16-224", test_vit),  # works
+        "vit": ("google/vit-base-patch16-224", test_vit),
     }
+
+    model_to_model_id_and_test_function = (
+        _text_generation_mapping | _mask_fill_mapping | _misc_model_mapping
+    )
+
     if args.model not in model_to_model_id_and_test_function:
         raise ValueError(
             f"Unknown model name: {args.model}. Available models: {model_to_model_id_and_test_function.keys()}"

.github/workflows/trunk.yml

Lines changed: 40 additions & 61 deletions
@@ -732,23 +732,26 @@ jobs:
         echo "::endgroup::"
       done
 
-  test-huggingface-transformers:
+  test-huggingface-transformers-xnnpack:
     # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
     if: ${{ !github.event.pull_request.head.repo.fork }}
-    name: test-huggingface-transformers
+    name: test-huggingface-transformers-xnnpack
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
     secrets: inherit
     strategy:
       matrix:
-        hf_model_id: [
-          google/gemma-3-1b-it,
-          Qwen/Qwen3-0.6B,
-          HuggingFaceTB/SmolLM2-135M,
-          meta-llama/Llama-3.2-1B,
-          allenai/OLMo-1B-hf,
+        config: [
+          # XNNPack.
+          llama3.2-1b|xnnpack|--quantize,
+          qwen3-0.6b|xnnpack|--quantize,
+          qwen3-1.7b|xnnpack|--quantize,
+          gemma3-1b|xnnpack|--quantize,
+          phi4-mini|xnnpack|--quantize,
+          smollm2-135m|xnnpack|--quantize,
+          smollm3-3b|xnnpack|--quantize
         ]
       fail-fast: false
     with:
@@ -760,6 +763,12 @@ jobs:
       timeout: 90
       upload-artifact: profiling-artifacts-${{ strategy.job-index }}
       script: |
+        set -eux
+        IFS='|' read -r MODEL RECIPE QUANTIZE <<< "${{ matrix.config }}"
+        echo "Model: $MODEL"
+        echo "Recipe: $RECIPE"
+        echo "Quantize: $QUANTIZE"
+
         echo "::group::Set up ExecuTorch"
         # The generic Linux job chooses to use base env, not the one setup by the image
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
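Each matrix entry packs the model name, recipe, and quantize flag into one `|`-delimited string, which the `read` line above splits into three shell variables; the equivalent in Python:

```python
# Equivalent of: IFS='|' read -r MODEL RECIPE QUANTIZE <<< "$config"
config = "llama3.2-1b|xnnpack|--quantize"
model, recipe, quantize = config.split("|")
assert (model, recipe, quantize) == ("llama3.2-1b", "xnnpack", "--quantize")
```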
@@ -797,82 +806,52 @@ jobs:
         pip list
         echo "::endgroup::"
 
-        echo "::group::Export to ExecuTorch"
-        # Pass matrix variable as environment variable
-        export MODEL_ID="${{ matrix.hf_model_id }}"
-        export OUTPUT_DIR="$(pwd)/${MODEL_ID}_custom_sdpa_kv_cache_8da4w"
-        pushd optimum-executorch
-
-        ARGS=(
-          "--model" "${MODEL_ID}"
-          "--task" "text-generation"
-          "--recipe" "xnnpack"
-          "--use_custom_sdpa"
-          "--use_custom_kv_cache"
-          "--qlinear" "8da4w"
-          "--qembedding" "8w"
-          "--output_dir" "${OUTPUT_DIR}"
-        )
-
-        optimum-cli export executorch "${ARGS[@]}"
-
-        ls -FlAGhp ${OUTPUT_DIR}
-        popd
-        echo "::endgroup::"
-
-        echo "::group::Inference using python API"
-        pushd optimum-executorch
-        python -c "
-        import os
-        from optimum.executorch import ExecuTorchModelForCausalLM
-        from transformers import AutoTokenizer
-
-        model_id = os.getenv('MODEL_ID')
-        pte_dir = os.getenv('OUTPUT_DIR')
-        print(f'Loading model {model_id} from {pte_dir}.')
-        model = ExecuTorchModelForCausalLM.from_pretrained(pte_dir)
-        generated_text = model.text_generation(
-            tokenizer=AutoTokenizer.from_pretrained(model_id),
-            prompt='Simply put, the theory of relativity states that',
-            max_seq_len=64
-        )
-        print(generated_text)
-        "
-        popd
+        echo "::group::Run tests"
+        export OUTPUT_DIR="$(pwd)/${MODEL}_${RECIPE}_${QUANTIZE}"
+        python .ci/scripts/test_huggingface_optimum_model.py --model ${MODEL} --recipe ${RECIPE} ${QUANTIZE} --model_dir ${OUTPUT_DIR}
         echo "::endgroup::"
 
-        echo "::group::Inference using executor_runner with ETDump"
+        echo "::group::Generate artifacts for performance profiling"
         ./cmake-out/executor_runner \
           --model_path ${OUTPUT_DIR}/model.pte \
           --etdump_path ${OUTPUT_DIR}/etdump.etdp
 
-        export TSV_PATH=artifacts-to-be-uploaded/${MODEL_ID}_op_prof.tsv
+        export TSV_PATH=artifacts-to-be-uploaded/${MODEL}_op_prof.tsv
         mkdir -p $(dirname "$TSV_PATH")
         python3 -m devtools.inspector.inspector_cli \
           --etdump_path ${OUTPUT_DIR}/etdump.etdp \
           --tsv_path ${TSV_PATH}
-
         echo "::endgroup::"
 
-  test-huggingface-optimum-coreml:
+  test-huggingface-transformers-coreml:
     # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
     if: ${{ !github.event.pull_request.head.repo.fork }}
-    name: test-huggingface-optimum-coreml
+    name: test-huggingface-transformers-coreml
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     permissions:
       id-token: write
       contents: read
     secrets: inherit
+    # Models below selected based on https://huggingface.co/models?pipeline_tag=text-generation&num_parameters=min:0,max:3B&sort=trending.
     strategy:
       matrix:
         config: [
-          qwen3|coreml_fp32_gpu|--quantize,
-          smollm|coreml_fp32_gpu|--quantize,
-          llama3|coreml_fp32_gpu|--quantize,
-          olmo|coreml_fp32_gpu|--quantize,
-          # roberta|coreml_fp32_gpu|--quantize, roberta requires special HF access
+          # XNNPack.
+          llama3.2-1b|xnnpack|--quantize,
+          qwen3-0.6b|xnnpack|--quantize,
+          qwen3-1.7b|xnnpack|--quantize,
+          gemma3-1b|xnnpack|--quantize,
+          phi4-mini|xnnpack|--quantize,
+          smollm2-135m|xnnpack|--quantize,
+          smollm3-3b|xnnpack|--quantize,
+          # CoreML.
+          llama3.2-1b|coreml_fp32_gpu|--quantize,
+          qwen3-0.6b|coreml_fp32_gpu|--quantize,
+          qwen3-1.7b|coreml_fp32_gpu|--quantize,
+          smollm2-135m|coreml_fp32_gpu|--quantize,
+          olmo|coreml_fp32_gpu|--quantize,
           bert|coreml_fp32_gpu|--quantize,
-          distilbert|coreml_fp32_gpu|--quantize,
+          distilbert|coreml_fp32_gpu|--quantize
         ]
       fail-fast: false
     with:
