
Commit c4868e0

Add CI jobs
1 parent b1081a3 commit c4868e0

File tree

9 files changed, +264 -73 lines changed

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-828ae02053a6e0e20a2dfd6e737ba10c6f4dee6b
+bd06b54e627fbfd354a2cffa4c80fb21883209a9

.ci/scripts/test_huggingface_optimum_model.py

Lines changed: 133 additions & 8 deletions
@@ -43,7 +43,9 @@ def cli_export(command, model_dir):
 
 
 def check_causal_lm_output_quality(
-    model_id: str, generated_tokens: List[int], max_perplexity_threshold: float = 100.0
+    model_id: str,
+    generated_tokens: List[int],
+    max_perplexity_threshold: float = 100.0,
 ):
     """
     Evaluates the quality of text generated by a causal language model by calculating its perplexity.
@@ -58,12 +60,24 @@ def check_causal_lm_output_quality(
     """
     logging.info(f"Starting perplexity check with model '{model_id}' ...")
     # Load model
-    model = AutoModelForCausalLM.from_pretrained(
-        model_id,
-        low_cpu_mem_usage=True,
-        use_cache=False,
-        torch_dtype=torch.bfloat16,
-    )
+    cls_name = AutoModelForCausalLM
+    if "llava" in model_id:
+        from transformers import LlavaForConditionalGeneration
+
+        cls_name = LlavaForConditionalGeneration
+    try:
+        model = cls_name.from_pretrained(
+            model_id,
+            low_cpu_mem_usage=True,
+            use_cache=False,
+            torch_dtype=torch.bfloat16,
+        )
+    except TypeError:
+        model = cls_name.from_pretrained(
+            model_id,
+            low_cpu_mem_usage=True,
+            torch_dtype=torch.bfloat16,
+        )
 
     with torch.no_grad():
         outputs = model(input_ids=generated_tokens, labels=generated_tokens)
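 
Note on the quality gate: since labels are set equal to input_ids, outputs.loss is the mean cross-entropy over the generated tokens, and the perplexity the docstring refers to is its exponential. The tail of check_causal_lm_output_quality is not part of this hunk; a minimal, illustrative sketch of the comparison it presumably performs (not the file's exact code):

    import torch

    def passes_perplexity_gate(loss: torch.Tensor, max_perplexity_threshold: float = 100.0) -> bool:
        # perplexity = exp(mean cross-entropy over the generated tokens); lower is better
        perplexity = torch.exp(loss)
        return bool(perplexity.item() <= max_perplexity_threshold)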
@@ -156,6 +170,105 @@ def test_text_generation(model_id, model_dir, recipe, *, quantize=True, run_only
     assert check_causal_lm_output_quality(model_id, generated_tokens) is True
 
 
+def test_llm_with_image_modality(
+    model_id, model_dir, recipe, *, quantize=True, run_only=False
+):
+    command = [
+        "optimum-cli",
+        "export",
+        "executorch",
+        "--model",
+        model_id,
+        "--task",
+        "multimodal-text-to-text",
+        "--recipe",
+        recipe,
+        "--output_dir",
+        model_dir,
+        "--use_custom_sdpa",
+        "--use_custom_kv_cache",
+        "--qlinear",
+        "8da4w",
+        "--qembedding",
+        "8w",
+    ]
+    if not run_only:
+        cli_export(command, model_dir)
+
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    tokenizer.save_pretrained(model_dir)
+
+    # input
+    processor = AutoProcessor.from_pretrained(model_id)
+    image_url = "https://llava-vl.github.io/static/images/view.jpg"
+    conversation = [
+        {
+            "role": "system",
+            "content": [
+                {
+                    "type": "text",
+                    "text": "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.",
+                }
+            ],
+        },
+        {
+            "role": "user",
+            "content": [
+                {"type": "image", "url": image_url},
+                {
+                    "type": "text",
+                    "text": "What are the things I should be cautious about when I visit here?",
+                },
+            ],
+        },
+    ]
+    inputs = processor.apply_chat_template(
+        conversation,
+        add_generation_prompt=True,
+        tokenize=True,
+        return_dict=True,
+        return_tensors="pt",
+    )
+
+    import torch
+
+    first_image_id_index = torch.where(inputs["input_ids"] == processor.image_token_id)[
+        1
+    ][0].item()
+    last_image_id_index = torch.where(inputs["input_ids"] == processor.image_token_id)[
+        1
+    ][-1].item()
+
+    prompt_before_image = inputs["input_ids"][0, :first_image_id_index]
+    prompt_after_image = inputs["input_ids"][0, last_image_id_index + 1 :]
+    from executorch.extension.llm.runner import (
+        GenerationConfig,
+        make_image_input,
+        make_token_input,
+        MultimodalRunner,
+    )
+
+    combined_inputs = [
+        make_token_input(prompt_before_image.tolist()),
+        make_image_input(inputs["pixel_values"]),
+        make_token_input(prompt_after_image.tolist()),
+    ]
+    runner = MultimodalRunner(f"{model_dir}/model.pte", f"{model_dir}/tokenizer.model")
+    generated_text = runner.generate_text(
+        combined_inputs, GenerationConfig(max_new_tokens=128, temperature=0, echo=False)
+    )
+    print(f"\nGenerated text:\n\t{generated_text}")
+    # Free memory before loading eager for quality check
+    del runner
+    gc.collect()
+    assert (
+        check_causal_lm_output_quality(
+            model_id, tokenizer.encode(generated_text, return_tensors="pt")
+        )
+        is True
+    )
+
+
 def test_fill_mask(model_id, model_dir, recipe, *, quantize=True, run_only=False):
     command = [
         "optimum-cli",
@@ -353,6 +466,9 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False):
         required=False,
         help="When provided, write the pte file to this directory. Otherwise, a temporary directory is created for the test.",
     )
+    parser.add_argument(
+        "--run_only", action="store_true", help="Skip export and only run the test"
+    )
     args = parser.parse_args()
 
     _text_generation_mapping = {
@@ -384,8 +500,16 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False):
         "vit": ("google/vit-base-patch16-224", test_vit),
     }
 
+    _multimodal_model_mapping = {
+        "gemma3-4b": ("google/gemma-3-4b-it", test_llm_with_image_modality),
+        "llava": ("llava-hf/llava-1.5-7b-hf", test_llm_with_image_modality),
+    }
+
     model_to_model_id_and_test_function = (
-        _text_generation_mapping | _mask_fill_mapping | _misc_model_mapping
+        _text_generation_mapping
+        | _mask_fill_mapping
+        | _misc_model_mapping
+        | _multimodal_model_mapping
     )
 
     if args.model not in model_to_model_id_and_test_function:
@@ -400,4 +524,5 @@ def test_vit(model_id, model_dir, recipe, *, quantize=False, run_only=False):
             model_dir=tmp_dir if args.model_dir is None else args.model_dir,
             recipe=args.recipe,
             quantize=args.quantize,
+            run_only=args.run_only,
         )
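 
For reference, the CI jobs below invoke this script as python .ci/scripts/test_huggingface_optimum_model.py --model <name> --quantize --recipe xnnpack. The new --run_only flag pairs with the run_only= keyword threaded through the test functions above: it skips the optimum-cli export (the "if not run_only: cli_export(...)" guard) and only exercises a previously exported model.pte in the supplied model directory.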

.github/workflows/pull.yml

Lines changed: 20 additions & 12 deletions
@@ -286,15 +286,20 @@ jobs:
       # Test selective build
       PYTHON_EXECUTABLE=python bash examples/selective_build/test_selective_build.sh "${BUILD_TOOL}"
 
-  test-llava-runner-linux:
-    name: test-llava-runner-linux
+  test-multimodal-linux:
+    if: ${{ !github.event.pull_request.head.repo.fork }}
+    name: test-multimodal-linux
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
       contents: read
+    secrets: inherit
     strategy:
       fail-fast: false
+      matrix:
+        model: ["gemma3-4b", "llava"]
     with:
+      secrets-env: EXECUTORCH_HF_TOKEN
       runner: linux.24xlarge
       docker-image: ci-image:executorch-ubuntu-22.04-clang12
       submodules: 'recursive'
@@ -305,17 +310,20 @@ jobs:
       CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
       conda activate "${CONDA_ENV}"
 
+      echo "::group::Setup ExecuTorch"
       PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh --build-tool "cmake"
-
-      # install Llava requirements
-      bash examples/models/llama/install_requirements.sh
-      bash examples/models/llava/install_requirements.sh
-
-      # run python unittest
-      python -m unittest examples.models.llava.test.test_llava
-
-      # run e2e (export, tokenizer and runner)
-      PYTHON_EXECUTABLE=python bash .ci/scripts/test_llava.sh
+      echo "::endgroup::"
+
+      echo "::group::Setup Huggingface"
+      pip install -U "huggingface_hub[cli]"
+      huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+      OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+      pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+      echo "::endgroup::"
+
+      echo "::group::Test ${{ matrix.model }}"
+      python .ci/scripts/test_huggingface_optimum_model.py --model ${{ matrix.model }} --quantize --recipe xnnpack
+      echo "::endgroup::"
 
   test-moshi-linux:
     name: test-moshi-linux
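 
Note on the token plumbing in this hunk: secrets: inherit passes repository secrets down to the reusable linux_job_v2.yml workflow, secrets-env: EXECUTORCH_HF_TOKEN asks that workflow to expose the named secret to the job script, and the script reads it back as $SECRET_EXECUTORCH_HF_TOKEN for huggingface-cli login. The macOS job added to trunk.yml below follows the same pattern.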

.github/workflows/trunk.yml

Lines changed: 35 additions & 28 deletions
@@ -616,34 +616,41 @@ jobs:
 
       bash .ci/scripts/test_torchao_huggingface_checkpoints.sh ${{ matrix.model }} ${{ matrix.test_with_runner && '--test_with_runner' || '' }}
 
-  # # TODO(jackzhxng): Runner consistently runs out of memory before test finishes. Try to find a more powerful runner.
-  # test-llava-runner-macos:
-  #   name: test-llava-runner-macos
-  #   uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
-  #   strategy:
-  #     fail-fast: false
-  #   with:
-  #     runner: macos-14-xlarge
-  #     python-version: '3.11'
-  #     submodules: 'recursive'
-  #     ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-  #     timeout: 900
-  #     script: |
-  #       BUILD_TOOL=cmake
-
-  #       bash .ci/scripts/setup-conda.sh
-  #       # Setup MacOS dependencies as there is no Docker support on MacOS atm
-  #       GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool "${BUILD_TOOL}"
-
-  #       # install Llava requirements
-  #       ${CONDA_RUN} bash examples/models/llama/install_requirements.sh
-  #       ${CONDA_RUN} bash examples/models/llava/install_requirements.sh
-
-  #       # run python unittest
-  #       ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava
-
-  #       # run e2e (export, tokenizer and runner)
-  #       PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh
+  test-multimodal-macos:
+    if: ${{ !github.event.pull_request.head.repo.fork }}
+    name: test-multimodal-macos
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        model: ["gemma3-4b"] # llava is probably too big so not covering it here.
+    with:
+      secrets-env: EXECUTORCH_HF_TOKEN
+      runner: macos-14-xlarge
+      python-version: '3.11'
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 900
+      script: |
+        echo "::group::Setup ExecuTorch"
+        bash .ci/scripts/setup-conda.sh
+        # Setup MacOS dependencies as there is no Docker support on MacOS atm
+        GITHUB_RUNNER=1 PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/setup-macos.sh --build-tool cmake
+        echo "::endgroup::"
+
+        echo "::group::Setup Huggingface"
+        ${CONDA_RUN} pip install -U "huggingface_hub[cli]"
+        ${CONDA_RUN} huggingface-cli login --token $SECRET_EXECUTORCH_HF_TOKEN
+        OPTIMUM_ET_VERSION=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
+        ${CONDA_RUN} pip install git+https://github.com/huggingface/optimum-executorch.git@${OPTIMUM_ET_VERSION}
+        echo "::endgroup::"
+
+        echo "::group::Test ${{ matrix.model }}"
+        ${CONDA_RUN} python .ci/scripts/test_huggingface_optimum_model.py --model ${{ matrix.model }} --quantize --recipe xnnpack
+        echo "::endgroup::"
 
   test-qnn-model:
     name: test-qnn-model

examples/models/llava/install_requirements.sh

Lines changed: 1 addition & 6 deletions
@@ -7,9 +7,4 @@
 
 set -x
 
-pip install transformers accelerate sentencepiece tiktoken
-
-# Run llama2/install requirements for torchao deps
-SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-
-bash "$SCRIPT_DIR"/../llama/install_requirements.sh
+pip install git+https://github.com/huggingface/optimum-executorch.git@d4d3046738ca31b5542506aaa76a28d540600227

extension/llm/runner/__init__.py

Lines changed: 0 additions & 9 deletions
@@ -11,13 +11,6 @@
 enabling processing of mixed inputs (text, images, audio) and text generation.
 """
 
-try:
-    from PIL import Image as PILImage
-
-    HAS_PIL = True
-except ImportError:
-    HAS_PIL = False
-
 try:
     # Import shared components from the compiled C++ extension
     from executorch.extension.llm.runner._llm_runner import ( # noqa: F401
@@ -50,5 +43,3 @@
     "MultimodalRunner",
     "Stats",
 ]
-
-__version__ = "0.1.0"

extension/llm/runner/_llm_runner.pyi

Lines changed: 25 additions & 2 deletions
@@ -34,8 +34,18 @@ class GenerationConfig:
     num_eos: int
     """Number of EOS tokens to add to the prompt."""
 
-    def __init__(self) -> None:
-        """Initialize GenerationConfig with default values."""
+    def __init__(
+        self,
+        *,
+        echo: bool = True,
+        max_new_tokens: int = -1,
+        warming: bool = False,
+        seq_len: int = -1,
+        temperature: float = 0.8,
+        num_bos: int = 0,
+        num_eos: int = 0,
+    ) -> None:
+        """Initialize GenerationConfig with optional keyword arguments for all fields."""
         ...
 
     def resolve_max_new_tokens(
@@ -360,6 +370,19 @@ class MultimodalRunner:
         """
         ...
 
+    def prefill(self, inputs: List[MultimodalInput]) -> None:
+        """
+        Prefill multimodal inputs (e.g., to rebuild KV cache from chat history)
+        without generating tokens.
+
+        Args:
+            inputs: List of multimodal inputs to prefill
+
+        Raises:
+            RuntimeError: If prefill fails
+        """
+        ...
+
     def generate_text(
         self, inputs: List[MultimodalInput], config: GenerationConfig
     ) -> str:
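 
Read together with test_llm_with_image_modality in .ci/scripts/test_huggingface_optimum_model.py above, the extended stubs suggest a usage pattern along these lines (a minimal sketch; the file paths and token IDs are illustrative, and the keyword defaults come from the stub):

    from executorch.extension.llm.runner import (
        GenerationConfig,
        MultimodalRunner,
        make_token_input,
    )

    # Illustrative paths: an exported program and its tokenizer must already exist.
    runner = MultimodalRunner("model.pte", "tokenizer.model")

    # Rebuild the KV cache from earlier chat turns without emitting any tokens ...
    runner.prefill([make_token_input([1, 2, 3, 4])])  # hypothetical token IDs

    # ... then generate a reply for the newest turn.
    config = GenerationConfig(max_new_tokens=64, temperature=0.0, echo=False)
    reply = runner.generate_text([make_token_input([5, 6, 7])], config)
    print(reply)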

extension/llm/runner/multimodal_input.h

Lines changed: 1 addition & 1 deletion
@@ -31,7 +31,7 @@ class ET_EXPERIMENTAL MultimodalInput {
   /// Type of multimodal input data
   enum class Type {
     TEXT, ///< Text string input
-    TOKENS, ///< Pre-tokenized input (vector of token IDs)
+    TOKENS, ///< Tokenizer encoded input (vector of token IDs)
     IMAGE, ///< Processed image input
     AUDIO, ///< Processed audio input
     RAW_AUDIO, ///< Raw unprocessed audio input (straight from audio file)
