
Commit b5fd482

Update
[ghstack-poisoned]
2 parents 0e422bf + 37bdc0b commit b5fd482

158 files changed: +4018 -1645 lines


.ci/scripts/test_llama_lora.sh

Lines changed: 51 additions & 14 deletions
@@ -48,8 +48,17 @@ DOWNLOADED_PATH=$(
   --model_id "${HF_MODEL_REPO}" \
   --files "adapter_config.json" "adapter_model.pt" "consolidated.00.pth" "params.json" "tokenizer.model"
 )
-EXPORTED_MODEL_NAME="llama_3_2_1B_lora.pte"
-# Export model.
+# Build llama runner.
+cmake_install_executorch_libraries
+cmake_build_llama_runner
+
+# Constants.
+RUNTIME_ARGS="--tokenizer_path=${DOWNLOADED_PATH}/tokenizer.model --temperature=0 --seq_len=20 --warmup=1"
+PROMPT="What happens if you eat watermelon seeds?"
+EXPECTED_PREFIX="What happens if you eat watermelon seeds? Watermelon seeds are a good source of vitamin C,"
+
+# Export LoRA PTE file.
+MODEL_NAME="llama_3_2_1B_lora"
 $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
   base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
   base.params="${DOWNLOADED_PATH}/params.json" \
@@ -61,36 +70,64 @@ $PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
   model.dtype_override="fp32" \
   backend.xnnpack.enabled=true \
   backend.xnnpack.extended_ops=true \
-  export.output_name="${EXPORTED_MODEL_NAME}"
-
-# Build llama runner.
-cmake_install_executorch_libraries
-cmake_build_llama_runner
+  export.output_name="${MODEL_NAME}.pte"
 
-PROMPT="What happens if you eat watermelon seeds?"
 # Run llama runner
-RUNTIME_ARGS="--model_path=${EXPORTED_MODEL_NAME} --tokenizer_path=${DOWNLOADED_PATH}/tokenizer.model --temperature=0 --seq_len=20 --warmup=1"
-
 NOW=$(date +"%H:%M:%S")
 echo "Starting to run llama runner at ${NOW}"
 # shellcheck source=/dev/null
-cmake-out/examples/models/llama/llama_main --prompt="${PROMPT}" ${RUNTIME_ARGS} > result.txt
+cmake-out/examples/models/llama/llama_main --model_path=${MODEL_NAME}.pte --prompt="${PROMPT}" ${RUNTIME_ARGS} > result.txt
 NOW=$(date +"%H:%M:%S")
 echo "Finished at ${NOW}"
 
 RESULT=$(cat result.txt)
-EXPECTED_PREFIX="What happens if you eat watermelon seeds? Watermelon seeds are a good source of vitamin C,"
-
 if [[ "${RESULT}" == "${EXPECTED_PREFIX}"* ]]; then
   echo "Expected result prefix: ${EXPECTED_PREFIX}"
   echo "Actual result: ${RESULT}"
+  # Do not clean up files if test passes, as they're re-used in the next test.
   echo "Success"
-  cleanup_files
 else
   echo "Expected result prefix: ${EXPECTED_PREFIX}"
   echo "Actual result: ${RESULT}"
   echo "Failure; results not the same"
+  cleanup_files
+  exit 1
+fi
 
+# Export LoRA PTE, PTD file.
+MODEL_SEPARATE="${MODEL_NAME}_separate"
+$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
+  base.checkpoint="${DOWNLOADED_PATH}/consolidated.00.pth" \
+  base.params="${DOWNLOADED_PATH}/params.json" \
+  base.adapter_checkpoint="${DOWNLOADED_PATH}/adapter_model.pt" \
+  base.adapter_config="${DOWNLOADED_PATH}/adapter_config.json" \
+  base.tokenizer_path="${DOWNLOADED_PATH}/tokenizer.model" \
+  model.use_kv_cache=true \
+  model.use_sdpa_with_kv_cache=true \
+  model.dtype_override="fp32" \
+  backend.xnnpack.enabled=true \
+  backend.xnnpack.extended_ops=true \
+  export.output_name="${MODEL_SEPARATE}.pte" \
+  export.foundation_weights_file="${MODEL_SEPARATE}.ptd"
+
+# Run llama runner.
+NOW=$(date +"%H:%M:%S")
+echo "Starting to run llama runner at ${NOW}"
+# shellcheck source=/dev/null
+cmake-out/examples/models/llama/llama_main --model_path=${MODEL_SEPARATE}.pte --data_path=${MODEL_SEPARATE}.ptd --prompt="${PROMPT}" ${RUNTIME_ARGS} > result2.txt
+NOW=$(date +"%H:%M:%S")
+echo "Finished at ${NOW}"
+
+RESULT2=$(cat result2.txt)
+if [[ "${RESULT2}" == "${EXPECTED_PREFIX}"* ]]; then
+  echo "Expected result prefix: ${EXPECTED_PREFIX}"
+  echo "Actual result: ${RESULT2}"
+  echo "Success"
+  cleanup_files
+else
+  echo "Expected result prefix: ${EXPECTED_PREFIX}"
+  echo "Actual result: ${RESULT2}"
+  echo "Failure; results not the same"
   cleanup_files
   exit 1
 fi

.github/workflows/build-presets.yml

Lines changed: 0 additions & 2 deletions
@@ -6,8 +6,6 @@ on:
     branches:
       - main
       - release/*
-    paths:
-      - .github/workflows/build-presets.yml
   workflow_dispatch:
 
 concurrency:

.github/workflows/pull.yml

Lines changed: 1 addition & 1 deletion
@@ -315,7 +315,7 @@ jobs:
 bash examples/models/moshi/mimi/install_requirements.sh
 
 # reinstall executorch
-bash ./install_executorch.sh
+bash ./install_executorch.sh --minimal
 
 # run python unittest
 python -m unittest examples.models.moshi.mimi.test_mimi

.github/workflows/trunk.yml

Lines changed: 1 addition & 0 deletions
@@ -288,6 +288,7 @@ jobs:
           - test_arm_baremetal: test_models_tosa
           - test_arm_baremetal: test_models_ethos-u55
           - test_arm_baremetal: test_models_ethos-u85
+          - test_arm_baremetal: test_smaller_stories_llama
       fail-fast: false
     with:
       runner: linux.2xlarge.memory

.lintrunner.toml

Lines changed: 30 additions & 0 deletions
@@ -136,6 +136,36 @@ init_command = [
     '--requirement=requirements-lintrunner.txt',
 ]
 
+[[linter]]
+code = 'CMAKEFORMAT'
+include_patterns = [
+    "**/*.cmake",
+    "**/*.cmake.in",
+    "**/CMakeLists.txt",
+]
+exclude_patterns = [
+    'third-party/**',
+    '**/third-party/**',
+]
+command = [
+    'python',
+    '-m',
+    'lintrunner_adapters',
+    'run',
+    'cmake_format_linter',
+    '--',
+    '@{{PATHSFILE}}',
+]
+init_command = [
+    'python',
+    '-m',
+    'lintrunner_adapters',
+    'run',
+    'pip_init',
+    '--dry-run={{DRYRUN}}',
+    '--requirement=requirements-lintrunner.txt',
+]
+
 [[linter]]
 code = 'ETCAPITAL'
 include_patterns = [

CMakeLists.txt

Lines changed: 6 additions & 2 deletions
@@ -284,15 +284,19 @@ if(EXECUTORCH_BUILD_KERNELS_TORCHAO)
   set(TORCHAO_BUILD_CPU_AARCH64 ON)
   set(TORCHAO_ENABLE_ARM_NEON_DOT ON)
 
-  list(APPEND TORCHAO_INCLUDE_DIRS
+  list(
+    APPEND
+    TORCHAO_INCLUDE_DIRS
     ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
     ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
     ${EXECUTORCH_ROOT}/third-party/ao
   )
 
   set(EXECUTORCH_INCLUDE_DIRS ${TORCHAO_INCLUDE_DIRS})
 
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/ao/torchao/experimental)
+  add_subdirectory(
+    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/ao/torchao/experimental
+  )
   executorch_target_link_options_shared_lib(torchao_ops_executorch)
   list(APPEND _executorch_kernels torchao_ops_executorch)
 endif()

backends/apple/coreml/recipes/coreml_recipe_provider.py

Lines changed: 18 additions & 17 deletions
@@ -59,11 +59,11 @@ def create_recipe(
             return self._build_fp_recipe(recipe_type, ct.precision.FLOAT32, **kwargs)
         elif recipe_type == CoreMLRecipeType.FP16:
             return self._build_fp_recipe(recipe_type, ct.precision.FLOAT16, **kwargs)
-        elif recipe_type == CoreMLRecipeType.INT8_STATIC:
+        elif recipe_type == CoreMLRecipeType.PT2E_INT8_STATIC:
             return self._build_pt2e_quantized_recipe(
                 recipe_type, activation_dtype=torch.quint8, **kwargs
             )
-        elif recipe_type == CoreMLRecipeType.INT8_WEIGHT_ONLY:
+        elif recipe_type == CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY:
             return self._build_pt2e_quantized_recipe(
                 recipe_type, activation_dtype=torch.float32, **kwargs
             )
@@ -201,6 +201,19 @@ def _validate_codebook_parameters(
                 f"Parameter 'block_size' must be a list, got {type(block_size).__name__}: {block_size}"
             )
 
+    def _validate_and_set_deployment_target(
+        self, kwargs: Any, min_target: ct.target, quantization_type: str
+    ) -> None:
+        """Validate or set minimum deployment target for quantization recipes"""
+        minimum_deployment_target = kwargs.get("minimum_deployment_target", None)
+        if minimum_deployment_target and minimum_deployment_target < min_target:
+            raise ValueError(
+                f"minimum_deployment_target must be {str(min_target)} or higher for {quantization_type} quantization"
+            )
+        else:
+            # Default to the minimum target for this quantization type
+            kwargs["minimum_deployment_target"] = min_target
+
     def _build_fp_recipe(
         self,
         recipe_type: RecipeType,
@@ -227,13 +240,7 @@ def _build_pt2e_quantized_recipe(
         """Build PT2E-based quantization recipe"""
         from executorch.backends.apple.coreml.quantizer import CoreMLQuantizer
 
-        minimum_deployment_target = kwargs.get("minimum_deployment_target", None)
-        if minimum_deployment_target and minimum_deployment_target < ct.target.iOS17:
-            raise ValueError(
-                "minimum_deployment_target must be iOS17 or higher for codebook quantization"
-            )
-        # Default to iOS17 for quantization
-        kwargs["minimum_deployment_target"] = ct.target.iOS17
+        self._validate_and_set_deployment_target(kwargs, ct.target.iOS17, "pt2e")
 
         # Validate activation_dtype
         assert activation_dtype in [
@@ -292,7 +299,7 @@ def _build_torchao_quantized_recipe(
         )
 
         # override minimum_deployment_target to ios18 for torchao (GH issue #13122)
-        kwargs["minimum_deployment_target"] = ct.target.iOS18
+        self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "torchao")
         lowering_recipe = self._get_coreml_lowering_recipe(**kwargs)
 
         return ExportRecipe(
@@ -313,13 +320,7 @@ def _build_codebook_quantized_recipe(
             CodebookWeightOnlyConfig,
         )
 
-        minimum_deployment_target = kwargs.get("minimum_deployment_target", None)
-        if minimum_deployment_target and minimum_deployment_target < ct.target.iOS18:
-            raise ValueError(
-                "minimum_deployment_target must be iOS18 or higher for codebook quantization"
-            )
-        # Default to iOS18 for codebook quantization
-        kwargs["minimum_deployment_target"] = ct.target.iOS18
+        self._validate_and_set_deployment_target(kwargs, ct.target.iOS18, "codebook")
 
         # Get the appropriate dtype (torch.uint1 through torch.uint8)
         dtype = getattr(torch, f"uint{bits}")
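
The duplicated deployment-target checks in the PT2E, TorchAO, and codebook builders now funnel through the shared _validate_and_set_deployment_target helper: an explicit minimum_deployment_target below the recipe family's floor raises a ValueError, and an omitted one is defaulted to that floor. A minimal caller-side sketch of that behavior follows; the CoreMLRecipeProvider class name and import path are assumptions, while create_recipe, the recipe ids, and the minimum_deployment_target kwarg come from the diff above.

# Minimal sketch; CoreMLRecipeProvider and its import path are assumed names.
# create_recipe, the recipe ids, and minimum_deployment_target are from the diff above.
import coremltools as ct
from executorch.backends.apple.coreml.recipes import CoreMLRecipeProvider, CoreMLRecipeType

provider = CoreMLRecipeProvider()

# No target given: the helper defaults it to the floor for this recipe family (iOS17 for PT2E).
recipe = provider.create_recipe(CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY)

# Target below the floor: the helper raises instead of silently overriding it.
try:
    provider.create_recipe(
        CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY,
        minimum_deployment_target=ct.target.iOS16,
    )
except ValueError as err:
    print(err)  # "minimum_deployment_target must be ... or higher for pt2e quantization"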

backends/apple/coreml/recipes/coreml_recipe_types.py

Lines changed: 11 additions & 10 deletions
@@ -12,7 +12,7 @@
 class CoreMLRecipeType(RecipeType):
     """CoreML-specific generic recipe types"""
 
-    # All the recipes accept common kwargs
+    ## All the recipes accept common kwargs
     # 1. minimum_deployment_unit (default: None)
     # 2. compute_unit (default: ct.ComputeUnit.ALL)
 
@@ -22,27 +22,28 @@ class CoreMLRecipeType(RecipeType):
     # FP16 precision recipe, defaults to values published by the CoreML backend and partitioner
     FP16 = "coreml_fp16"
 
-    # PT2E-based quantization recipes
+    ## PT2E-based quantization recipes
     # INT8 Static Quantization (weights + activations), requires calibration dataset
-    INT8_STATIC = "coreml_int8_static"
+    PT2E_INT8_STATIC = "coreml_pt2e_int8_static"
     # INT8 Weight-only Quantization (activations remain FP32)
-    INT8_WEIGHT_ONLY = "coreml_int8_weight_only"
+    PT2E_INT8_WEIGHT_ONLY = "coreml_pt2e_int8_weight_only"
 
-    # TorchAO-based quantization recipes
+    ## TorchAO-based quantization recipes
     # All TorchAO recipes accept filter_fn kwarg to control which layers are quantized
     # INT4 Weight-only Quantization, per-channel (axis=0)
-    # Additional kwargs: filter_fn (default: None - quantizes all applicable layers)
+    # Additional kwargs: filter_fn (default: None - quantizes linear layers)
     INT4_WEIGHT_ONLY_PER_CHANNEL = "coreml_int4_weight_only_per_channel"
     # INT4 Weight-only Quantization, per-group
-    # Additional kwargs: group_size (default: 32), filter_fn (default: None - quantizes all applicable layers)
+    # Additional kwargs: group_size (default: 32), filter_fn (default: None - quantizes linear layers)
     INT4_WEIGHT_ONLY_PER_GROUP = "coreml_int4_weight_only_per_group"
     # INT8 Weight-only Quantization, per-channel (axis=0)
-    # Additional kwargs: filter_fn (default: None - quantizes all applicable layers)
+    # Additional kwargs: filter_fn (default: None - quantizes linear layers)
     INT8_WEIGHT_ONLY_PER_CHANNEL = "coreml_int8_weight_only_per_channel"
     # INT8 Weight-only Quantization, per-group
-    # Additional kwargs: group_size (default: 32), filter_fn (default: None - quantizes all applicable layers)
+    # Additional kwargs: group_size (default: 32), filter_fn (default: None - quantizes linear layers)
     INT8_WEIGHT_ONLY_PER_GROUP = "coreml_int8_weight_only_per_group"
-    # Codebook/Palettization Quantization
+
+    ## Codebook/Palettization Quantization
     # Additional kwargs: bits (1-8, default: 3), block_size (default: [-1, 16]),
     # filter_fn (default: targets Linear and Embedding layers only)
     CODEBOOK_WEIGHT_ONLY = "coreml_codebook_weight_only"
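
Both the PT2E enum members and their string ids changed, so callers that referenced either form need updating. A small migration sketch follows; the import path is assumed from the file location and RecipeType is assumed to behave like a standard string-valued Enum, while the old and new names are taken from the diff above.

# Migration sketch; import path assumed from the file location, RecipeType assumed
# to be a string-valued Enum. Old -> new names are taken from the diff above.
from executorch.backends.apple.coreml.recipes.coreml_recipe_types import CoreMLRecipeType

# Old member / id                      New member / id
# CoreMLRecipeType.INT8_STATIC      -> CoreMLRecipeType.PT2E_INT8_STATIC
# "coreml_int8_static"              -> "coreml_pt2e_int8_static"
# CoreMLRecipeType.INT8_WEIGHT_ONLY -> CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY
# "coreml_int8_weight_only"         -> "coreml_pt2e_int8_weight_only"

assert CoreMLRecipeType.PT2E_INT8_STATIC.value == "coreml_pt2e_int8_static"
assert CoreMLRecipeType.PT2E_INT8_WEIGHT_ONLY.value == "coreml_pt2e_int8_weight_only"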
