Commit c16f488

up

1 parent cea9ec1 commit c16f488

File tree: 2 files changed, +294 -49 lines changed
.ci/scripts/test_huggingface_optimum_coreml.py

Lines changed: 276 additions & 0 deletions
@@ -0,0 +1,276 @@
import argparse
import subprocess
import tempfile
from pathlib import Path

import torch
from datasets import load_dataset
from transformers import AutoProcessor, AutoTokenizer, AutoModelForImageClassification, AutoConfig

from optimum.executorch import (
    ExecuTorchModelForCausalLM,
    ExecuTorchModelForMaskedLM,
    ExecuTorchModelForSeq2SeqLM,
    ExecuTorchModelForSpeechSeq2Seq,
    ExecuTorchModelForImageClassification,
)


def cli_export(command, model_dir):
    p = Path(model_dir)
    if p.exists():
        if not p.is_dir():
            raise Exception(f"Path {model_dir} already exists and is not a directory.")
        if any(p.iterdir()):
            raise Exception(f"Existing directory {model_dir} is non-empty. Please remove it first.")
    try:
        subprocess.run(command, check=True)
        print("Export completed successfully.")
    except subprocess.CalledProcessError as e:
        print(f"Export failed with error: {e}")
        # Re-raise so a failed export fails the test immediately instead of
        # surfacing later as a confusing model-loading error.
        raise


def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only=False):
    command = [
        "optimum-cli",
        "export",
        "executorch",
        "--model",
        model_id,
        "--task",
        "text-generation",
        "--recipe",
        recipe,
        "--output_dir",
        model_dir,
    ]
    if "coreml" in recipe:
        command += [
            "--disable_dynamic_shapes",
        ]
        if quantize:
            command += [
                "--qlinear", "4w",
                "--qembedding", "8w",
            ]
    else:
        assert not quantize, "Quantization is not supported for non-CoreML recipes yet"

    if not run_only:
        cli_export(command, model_dir)

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.save_pretrained(model_dir)
    model = ExecuTorchModelForCausalLM.from_pretrained(model_dir)
    generated_text = model.text_generation(
        tokenizer=tokenizer,
        prompt="Simply put, the theory of relativity states that",
        max_seq_len=64,
    )
    print(f"\nGenerated text:\n\t{generated_text}")


def test_fill_mask(model_id, model_dir, recipe, *, quantize=True, run_only=False):
    command = [
        "optimum-cli",
        "export",
        "executorch",
        "--model",
        model_id,
        "--task",
        "fill-mask",
        "--recipe",
        recipe,
        "--output_dir",
        model_dir,
    ]
    if "coreml" in recipe and quantize:
        command += [
            "--qlinear", "4w",
            "--qembedding", "8w",
        ]
    else:
        assert not quantize, "Quantization is not supported for non-CoreML recipes yet"

    if not run_only:
        cli_export(command, model_dir)

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = ExecuTorchModelForMaskedLM.from_pretrained(model_dir)
    input_text = f"Paris is the {tokenizer.mask_token} of France."
    inputs = tokenizer(
        input_text,
        return_tensors="pt",
        padding="max_length",
        max_length=10,
    )

    # Test inference using the ExecuTorch model. For this prompt the mask token
    # lands at position 4 once tokenized (e.g. [CLS] paris is the [MASK] ...).
    exported_outputs = model.forward(inputs["input_ids"], inputs["attention_mask"])
    predicted_masks = tokenizer.decode(exported_outputs[0, 4].topk(5).indices)
    print(f"\nInput text:\n\t{input_text}\nPredicted masks:\n\t{predicted_masks}")


def test_t5(model_id, model_dir, recipe, *, quantize=False, run_only=False):
    assert not quantize, "Quantization is not supported for the T5 model yet"

    assert model_id == "google-t5/t5-small"
    command = [
        "optimum-cli",
        "export",
        "executorch",
        "--model",
        model_id,
        "--task",
        "text2text-generation",
        "--recipe",
        recipe,
        "--output_dir",
        model_dir,
    ]
    if not run_only:
        cli_export(command, model_dir)

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = ExecuTorchModelForSeq2SeqLM.from_pretrained(model_dir)
    article = (
        " New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. A"
        " year later, she got married again in Westchester County, but to a different man and without divorcing"
        " her first husband. Only 18 days after that marriage, she got hitched yet again. Then, Barrientos"
        ' declared "I do" five more times, sometimes only within two weeks of each other. In 2010, she married'
        " once more, this time in the Bronx. In an application for a marriage license, she stated it was her"
        ' "first and only" marriage. Barrientos, now 39, is facing two criminal counts of "offering a false'
        ' instrument for filing in the first degree," referring to her false statements on the 2010 marriage'
        " license application, according to court documents. Prosecutors said the marriages were part of an"
        " immigration scam. On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to"
        " her attorney, Christopher Wright, who declined to comment further. After leaving court, Barrientos was"
        " arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New"
        " York subway through an emergency exit, said Detective Annette Markowski, a police spokeswoman. In total,"
        " Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. All"
        " occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be"
        " married to four men, and at one time, she was married to eight men at once, prosecutors say. Prosecutors"
        " said the immigration scam involved some of her husbands, who filed for permanent residence status"
        " shortly after the marriages. Any divorces happened only after such filings were approved. It was"
        " unclear whether any of the men will be prosecuted. The case was referred to the Bronx District"
        " Attorney's Office by Immigration and Customs Enforcement and the Department of Homeland Security's"
        ' Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt,'
        " Turkey, Georgia, Pakistan and Mali. Her eighth husband, Rashid Rajput, was deported in 2006 to his"
        " native Pakistan after an investigation by the Joint Terrorism Task Force."
    )
    article = "summarize: " + article.strip()

    generated_text = model.text_generation(
        tokenizer=tokenizer,
        prompt=article,
    )
    expected_text = 'a year later, she got married again in westchester county, new york. she was married to a different man, but only 18 days after that marriage. she is facing two criminal counts of "offering a false instrument"'
    print(f"Generated text:\n\t{generated_text}")
    print(f"Expected text:\n\t{expected_text}")


def test_whisper(model_id, model_dir, recipe, *, quantize=False, run_only=False):
    assert not quantize, "Quantization is not supported for the Whisper model yet"

    assert model_id == "openai/whisper-tiny"
    command = [
        "optimum-cli",
        "export",
        "executorch",
        "--model",
        model_id,
        "--task",
        "automatic-speech-recognition",
        "--recipe",
        recipe,
        "--output_dir",
        model_dir,
    ]
    if not run_only:
        cli_export(command, model_dir)

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = ExecuTorchModelForSpeechSeq2Seq.from_pretrained(model_dir)
    processor = AutoProcessor.from_pretrained(model_id)
    dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
    sample = dataset[0]["audio"]

    input_features = processor(
        sample["array"],
        return_tensors="pt",
        truncation=False,
        sampling_rate=sample["sampling_rate"],
    ).input_features

    # The current implementation of the transcribe method accepts up to 30 seconds
    # of audio, so trim the input to 3000 feature frames (100 frames per second).
    input_features_trimmed = input_features[:, :, :3000].contiguous()

    generated_transcription = model.transcribe(tokenizer, input_features_trimmed)
    expected_text = " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similarly drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Latins work is really Greek after all, and can discover that."
    print(f"Generated transcription: {generated_transcription}")
    print(f"Expected transcription: {expected_text}")


def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False):
    assert not quantize, "Quantization is not supported for ViT models yet."

    assert model_id == "google/vit-base-patch16-224"
    command = [
        "optimum-cli",
        "export",
        "executorch",
        "--model",
        model_id,
        "--task",
        "image-classification",
        "--recipe",
        recipe,
        "--output_dir",
        model_dir,
    ]
    if not run_only:
        cli_export(command, model_dir)

    config = AutoConfig.from_pretrained(model_id)
    batch_size = 1
    num_channels = config.num_channels
    height = config.image_size
    width = config.image_size
    pixel_values = torch.rand(batch_size, num_channels, height, width)

    # Test fetching and lowering the model to ExecuTorch
    et_model = ExecuTorchModelForImageClassification.from_pretrained(model_dir)
    eager_model = AutoModelForImageClassification.from_pretrained(model_id).eval().to("cpu")
    with torch.no_grad():
        eager_output = eager_model(pixel_values)
        et_output = et_model.forward(pixel_values)

    assert torch.allclose(eager_output.logits, et_output, atol=1e-02, rtol=1e-02), "CoreML output does not match eager"


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, required=True)
    parser.add_argument("--recipe", type=str, required=True)
    parser.add_argument("--quantize", action="store_true", help="Enable quantization")
    args = parser.parse_args()

    model_to_model_id_and_test_function = {
        "smollm": ("HuggingFaceTB/SmolLM2-135M", test_text_generation),  # works
        "qwen3": ("Qwen/Qwen3-0.6B", test_text_generation),  # works
        "olmo": ("allenai/OLMo-1B-hf", test_text_generation),  # works
        "gemma3": ("unsloth/gemma-3-1b-it", test_text_generation),  # does not export
        "phi4": ("microsoft/Phi-4-mini-instruct", test_text_generation),  # fails to lower
        "llama3": ("NousResearch/Llama-3.2-1B", test_text_generation),  # works
        "bert": ("google-bert/bert-base-uncased", test_fill_mask),  # works
        "roberta": ("FacebookAI/xlm-roberta-base", test_fill_mask),  # works
        "distilbert": ("distilbert/distilbert-base-uncased", test_fill_mask),  # works
        "whisper": ("openai/whisper-tiny", test_whisper),  # works
        "t5": ("google-t5/t5-small", test_t5),  # CoreML runtime failure
        "vit": ("google/vit-base-patch16-224", test_vit),  # works
    }
    if args.model not in model_to_model_id_and_test_function:
        raise ValueError(
            f"Unknown model name: {args.model}. Available models: {list(model_to_model_id_and_test_function.keys())}"
        )

    with tempfile.TemporaryDirectory() as tmp_dir:
        model_id, test_fn = model_to_model_id_and_test_function[args.model]
        test_fn(model_id=model_id, model_dir=tmp_dir, recipe=args.recipe, quantize=args.quantize)
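
This script is what the updated workflow below invokes. It can be run the same way locally for debugging; for example (assuming a macOS host with optimum-executorch and the ExecuTorch CoreML dependencies installed):

    python .ci/scripts/test_huggingface_optimum_coreml.py --model qwen3 --recipe coreml_fp32_gpu --quantize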

.github/workflows/trunk.yml

Lines changed: 18 additions & 49 deletions
@@ -799,22 +799,26 @@ jobs:

       echo "::endgroup::"

-  test-huggingface-optimum-text-generation-coreml:
+  test-huggingface-optimum-coreml:
     # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway
     if: ${{ !github.event.pull_request.head.repo.fork }}
-    name: test-huggingface-optimum-text-generation-coreml
+    name: test-huggingface-optimum-coreml
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
     permissions:
       id-token: write
       contents: read
     secrets: inherit
     strategy:
       matrix:
-        hf_model_id: [
-          Qwen/Qwen3-0.6B,
-          HuggingFaceTB/SmolLM2-135M,
-          meta-llama/Llama-3.2-1B,
-          allenai/OLMo-1B-hf,
+        config: [
+          qwen3|coreml_fp32_gpu|--quantize,
+          smollm|coreml_fp32_gpu|--quantize,
+          llama3|coreml_fp32_gpu|--quantize,
+          olmo|coreml_fp32_gpu|--quantize,
+          bert|coreml_fp32_gpu|--quantize,
+          roberta|coreml_fp32_gpu|--quantize,
+          distilbert|coreml_fp32_gpu|--quantize,
+          vit|coreml_fp16_ne
         ]
       fail-fast: false
     with:

@@ -826,6 +830,11 @@ jobs:
       timeout: 90
       script: |
         set -eux
+        IFS='|' read -r MODEL RECIPE QUANTIZE <<< "${{ matrix.config }}"
+        echo "Model: $MODEL"
+        echo "Recipe: $RECIPE"
+        echo "Quantize: $QUANTIZE"
+
         echo "::group::Set up ExecuTorch"
         bash .ci/scripts/setup-conda.sh
         eval "$(conda shell.bash hook)"

@@ -852,48 +861,8 @@ jobs:
         ${CONDA_RUN} pip list
         echo "::endgroup::"

-        echo "::group::Export to ExecuTorch"
-        # Pass matrix variable as environment variable
-        export MODEL_ID="${{ matrix.hf_model_id }}"
-        export OUTPUT_DIR="$(pwd)/${MODEL_ID}_coreml"
-        pushd optimum-executorch
-
-        ARGS=(
-          "--model" "${MODEL_ID}"
-          "--task" "text-generation"
-          "--recipe" "coreml_fp32_gpu"
-          "--qlinear" "4w"
-          "--qembedding" "8w"
-          "--disable_dynamic_shapes"
-          "--output_dir" "${OUTPUT_DIR}"
-        )
-
-        ${CONDA_RUN} optimum-cli export executorch "${ARGS[@]}"
-
-        ls -FlAGhp ${OUTPUT_DIR}
-        popd
-        echo "::endgroup::"
-
-        echo "::group::Inference using python API"
-        pushd optimum-executorch
-        ${CONDA_RUN} python -c "
-        import os
-        from optimum.executorch import ExecuTorchModelForCausalLM
-        from transformers import AutoTokenizer
-
-        model_id = os.getenv('MODEL_ID')
-        pte_dir = os.getenv('OUTPUT_DIR')
-        print(f'Loading model {model_id} from {pte_dir}.')
-        model = ExecuTorchModelForCausalLM.from_pretrained(pte_dir)
-        generated_text = model.text_generation(
-            tokenizer=AutoTokenizer.from_pretrained(model_id),
-            prompt='Simply put, the theory of relativity states that',
-            max_seq_len=64
-        )
-        print(generated_text)
-        "
-        popd
-        echo "::endgroup::"
+        # Run test
+        ${CONDA_RUN} python .ci/scripts/test_huggingface_optimum_coreml.py --model ${MODEL} --recipe ${RECIPE} ${QUANTIZE}

   test-llama-runner-qnn-linux:
     name: test-llama-runner-qnn-linux
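
Each matrix entry packs the model name, the export recipe, and an optional quantize flag into one pipe-delimited string, and the IFS='|' read line in the script block splits it back into three variables. A minimal sketch of that parsing in isolation, using example values from the matrix above:

    config="vit|coreml_fp16_ne"
    IFS='|' read -r MODEL RECIPE QUANTIZE <<< "$config"
    echo "$MODEL"     # vit
    echo "$RECIPE"    # coreml_fp16_ne
    echo "$QUANTIZE"  # empty here; "--quantize" for the entries that include it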
