
Commit 4d246be

morelos authored and committed
Update on "[ET-VK] double, short, and uint16 dtype runtime support"
Creating support for the double, short, and uint16 dtypes in the quantization ops. Registering the short keyword, since support for it already exists. Also changing the CPU implementation to support half.

Differential Revision: [D75959063](https://our.internmc.facebook.com/intern/diff/D75959063/)

[ghstack-poisoned]
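
For context, a minimal sketch of the affine quantization these runtime ops compute, extended to the newly supported dtypes. This is illustrative only; the function name and signature below are assumptions for the sketch, not the ET-VK or CPU kernel API.

```python
import torch

def quantize_per_tensor_sketch(x, scale, zero_point, qmin, qmax, dtype):
    # Affine quantization: round(x / scale) + zero_point, clamped to [qmin, qmax].
    q = torch.round(x / scale) + zero_point
    return torch.clamp(q, qmin, qmax).to(dtype)

x = torch.randn(8, dtype=torch.float64)  # double input
q_int16 = quantize_per_tensor_sketch(x, 0.05, 0, -32768, 32767, torch.int16)    # short
q_uint16 = quantize_per_tensor_sketch(x, 0.05, 32768, 0, 65535, torch.uint16)   # uint16 (needs a PyTorch build with torch.uint16)
```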
2 parents 4121e3e + 8af4f4d commit 4d246be

File tree

112 files changed: +2922 -6434 lines changed


.ci/scripts/build-mediatek-sdk.sh

Lines changed: 22 additions & 0 deletions

@@ -0,0 +1,22 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -eux
+
+build_neuron_backend() {
+  echo "Start building neuron backend."
+  export ANDROID_NDK=/opt/ndk
+  export MEDIATEK_SDK_ROOT=/tmp/neuropilot
+  export NEURON_BUFFER_ALLOCATOR_LIB=${MEDIATEK_SDK_ROOT}/libneuron_buffer_allocator.so
+  export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
+
+
+  cd ${EXECUTORCH_ROOT}
+  ./backends/mediatek/scripts/mtk_build.sh
+}
+
+build_neuron_backend

.ci/scripts/gather_benchmark_configs.py

Lines changed: 9 additions & 8 deletions

@@ -32,7 +32,8 @@
 BENCHMARK_CONFIGS = {
     "xplat": [
         "xnnpack_q8",
-        "hf_xnnpack_fp32",
+        "hf_xnnpack_custom_spda_kv_cache_8da4w",
+        "et_xnnpack_custom_spda_kv_cache_8da4w",
         "llama3_fb16",
         "llama3_spinquant",
         "llama3_qlora",
@@ -129,25 +130,25 @@ def generate_compatible_configs(model_name: str, target_os=None) -> List[str]:
     """
     configs = []
     if is_valid_huggingface_model_id(model_name):
+        configs.append("hf_xnnpack_custom_spda_kv_cache_8da4w")
         if model_name.startswith("meta-llama/"):
-            # LLaMA models
+            # etLLM recipes for Llama
            repo_name = model_name.split("meta-llama/")[1]
            if "qlora" in repo_name.lower():
-                configs.append("llama3_qlora")
+                configs = ["llama3_qlora"]
            elif "spinquant" in repo_name.lower():
-                configs.append("llama3_spinquant")
+                configs = ["llama3_spinquant"]
            else:
-                configs.append("llama3_fb16")
+                configs.extend(["llama3_fb16", "et_xnnpack_custom_spda_kv_cache_8da4w"])
            configs.extend(
                [
                    config
                    for config in BENCHMARK_CONFIGS.get(target_os, [])
                    if config.startswith("llama")
                ]
            )
-        else:
-            # Non-LLaMA models
-            configs.append("hf_xnnpack_fp32")
+        if model_name.startswith("Qwen/Qwen3"):
+            configs.append("et_xnnpack_custom_spda_kv_cache_8da4w")
     elif model_name in MODEL_NAME_TO_MODEL:
         # ExecuTorch in-tree non-GenAI models
         configs.append("xnnpack_q8")
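
For reference, a rough sketch of how the updated selection logic is expected to behave, based on this diff and the updated unit tests further down. The import path and the assert style are assumptions for illustration, not part of the change.

```python
# Illustrative only; the import path of the CI script is assumed.
from gather_benchmark_configs import generate_compatible_configs

# Hugging Face Llama checkpoints now get the etLLM recipes plus the
# optimum-executorch recipe instead of the removed hf_xnnpack_fp32 default.
android = generate_compatible_configs("meta-llama/Llama-3.2-1B", "android")
assert sorted(android) == sorted([
    "hf_xnnpack_custom_spda_kv_cache_8da4w",
    "llama3_fb16",
    "et_xnnpack_custom_spda_kv_cache_8da4w",
])

# Qwen3 checkpoints pick up the ExecuTorch XNNPACK recipe on top of the
# optimum-executorch one.
qwen = generate_compatible_configs("Qwen/Qwen3-0.6B", "android")
assert sorted(qwen) == sorted([
    "hf_xnnpack_custom_spda_kv_cache_8da4w",
    "et_xnnpack_custom_spda_kv_cache_8da4w",
])
```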

.ci/scripts/setup-mediatek-deps.sh

Lines changed: 42 additions & 0 deletions

@@ -0,0 +1,42 @@
+#!/bin/bash
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+set -eux
+
+MEDIATEK_INSTALLATION_DIR=/tmp/neuropilot
+EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
+
+install_neuropilot() {
+  echo "Start installing neuropilot."
+  mkdir -p "${MEDIATEK_INSTALLATION_DIR}"
+
+  curl -Lo /tmp/neuropilot-express.tar.gz "https://s3.ap-southeast-1.amazonaws.com/mediatek.neuropilot.com/06302508-4c94-4bf2-9789-b0ee44e83e27.gz"
+  echo "Finishing downloading neuropilot sdk."
+  tar zxvf /tmp/neuropilot-express.tar.gz --strip-components=1 --directory "${MEDIATEK_INSTALLATION_DIR}"
+  echo "Finishing unzip neuropilot sdk."
+
+  # Copy NP header
+  cp ${MEDIATEK_INSTALLATION_DIR}/api/NeuronAdapter.h ${EXECUTORCH_ROOT}/backends/mediatek/runtime/include/api/
+
+  # Print the content for manual verification
+  ls -lah "${MEDIATEK_INSTALLATION_DIR}"
+}
+
+setup_neuropilot() {
+  pip3 install -r ${EXECUTORCH_ROOT}/backends/mediatek/requirements.txt
+  pip3 install ${MEDIATEK_INSTALLATION_DIR}/mtk_neuron-8.2.19-py3-none-linux_x86_64.whl
+  pip3 install ${MEDIATEK_INSTALLATION_DIR}/mtk_converter-8.13.0_public_packages/mtk_converter-8.13.0+public-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
+}
+
+setup_calibration_data() {
+  curl -Lo /tmp/imagenette2-160.tgz https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-160.tgz
+  tar zxvf /tmp/imagenette2-160.tgz --strip-components=1 --directory "${MEDIATEK_INSTALLATION_DIR}"
+}
+
+install_neuropilot
+setup_neuropilot
+setup_calibration_data

.ci/scripts/test_llava.sh

Lines changed: 1 addition & 1 deletion

@@ -147,7 +147,7 @@ run_and_verify() {

   # verify result.txt
   RESULT=$(cat result.txt)
-  EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with several players on the court. "
+  EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with"

   if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then
     echo "Expected result prefix: ${EXPECTED_PREFIX}"

.ci/scripts/test_model.sh

Lines changed: 24 additions & 0 deletions

@@ -244,6 +244,24 @@ test_model_with_mps() {
   EXPORTED_MODEL=$(find "." -type f -name "${MODEL_NAME}*.pte" -print -quit)
 }

+test_model_with_mediatek() {
+  if [[ "${MODEL_NAME}" == "dl3" ]]; then
+    EXPORT_SCRIPT=deeplab_v3
+  elif [[ "${MODEL_NAME}" == "mv3" ]]; then
+    EXPORT_SCRIPT=mobilenet_v3
+  elif [[ "${MODEL_NAME}" == "mv2" ]]; then
+    EXPORT_SCRIPT=mobilenet_v2
+  elif [[ "${MODEL_NAME}" == "ic4" ]]; then
+    EXPORT_SCRIPT=inception_v4
+  elif [[ "${MODEL_NAME}" == "ic3" ]]; then
+    EXPORT_SCRIPT=inception_v3
+  fi
+
+  PYTHONPATH=examples/mediatek/ "${PYTHON_EXECUTABLE}" -m examples.mediatek.model_export_scripts.${EXPORT_SCRIPT} -d /tmp/neuropilot/train -a ${EXPORT_SCRIPT}
+  EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "*.pte" -print -quit)
+}
+
+
 if [[ "${BACKEND}" == "portable" ]]; then
   echo "Testing ${MODEL_NAME} with portable kernels..."
   test_model
@@ -281,6 +299,12 @@ elif [[ "${BACKEND}" == *"xnnpack"* ]]; then
   if [[ $? -eq 0 ]]; then
     prepare_artifacts_upload
   fi
+elif [[ "${BACKEND}" == "mediatek" ]]; then
+  echo "Testing ${MODEL_NAME} with mediatek..."
+  test_model_with_mediatek
+  if [[ $? -eq 0 ]]; then
+    prepare_artifacts_upload
+  fi
 else
   set +e
   if [[ "${BACKEND}" == *"quantization"* ]]; then

.ci/scripts/tests/test_gather_benchmark_configs.py

Lines changed: 13 additions & 4 deletions

@@ -112,15 +112,24 @@ def test_generate_compatible_configs_llama_model(self):
         result = self.gather_benchmark_configs.generate_compatible_configs(
             model_name, target_os
         )
-        expected = ["llama3_fb16", "llama3_coreml_ane"]
-        self.assertEqual(result, expected)
+        expected = [
+            "llama3_fb16",
+            "llama3_coreml_ane",
+            "et_xnnpack_custom_spda_kv_cache_8da4w",
+            "hf_xnnpack_custom_spda_kv_cache_8da4w",
+        ]
+        self.assertCountEqual(result, expected)

         target_os = "android"
         result = self.gather_benchmark_configs.generate_compatible_configs(
             model_name, target_os
         )
-        expected = ["llama3_fb16"]
-        self.assertEqual(result, expected)
+        expected = [
+            "llama3_fb16",
+            "et_xnnpack_custom_spda_kv_cache_8da4w",
+            "hf_xnnpack_custom_spda_kv_cache_8da4w",
+        ]
+        self.assertCountEqual(result, expected)

     def test_generate_compatible_configs_quantized_llama_model(self):
         model_name = "meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8"

.ci/scripts/unittest-buck2.sh

Lines changed: 3 additions & 1 deletion

@@ -15,8 +15,10 @@ buck2 query "//backends/apple/... + //backends/example/... + \
 //kernels/optimized/... + //kernels/portable/... + //kernels/quantized/... + \
 //kernels/test/... + //runtime/... + //schema/... + //test/... + //util/..."

+# TODO: optimized ops are unbuildable because they now use ATen; put
+# them back after we can use PyTorch in OSS buck.
 UNBUILDABLE_OPTIMIZED_OPS_REGEX="_elu|gelu|fft|log_softmax"
-BUILDABLE_OPTIMIZED_OPS=$(buck2 query //kernels/optimized/cpu/... | grep -E -v $UNBUILDABLE_OPTIMIZED_OPS_REGEX)
+BUILDABLE_OPTIMIZED_OPS= #$(buck2 query //kernels/optimized/cpu/... | grep -E -v $UNBUILDABLE_OPTIMIZED_OPS_REGEX)

 # TODO: build prim_ops_test_cpp again once supported_features works in
 # OSS buck.

.github/workflows/android-perf-private-device-experiment.yml

Lines changed: 3 additions & 3 deletions

@@ -18,7 +18,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -34,7 +34,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -57,6 +57,6 @@ jobs:
       id-token: write
       contents: read
     with:
-      models: ${{ inputs.models || 'mv3,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' }}
+      models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
       devices: samsung_galaxy_s22_private
       benchmark_configs: ${{ inputs.benchmark_configs }}

.github/workflows/android-perf.yml

Lines changed: 86 additions & 15 deletions

@@ -6,12 +6,14 @@ on:
   pull_request:
     paths:
       - .github/workflows/android-perf.yml
+      - .ci/scripts/gather_benchmark_configs.py
       - extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
   push:
     branches:
       - main
     paths:
       - .github/workflows/android-perf.yml
+      - .ci/scripts/gather_benchmark_configs.py
       - extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
   # Note: GitHub has an upper limit of 10 inputs
   workflow_dispatch:
@@ -20,7 +22,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: llama
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -36,7 +38,7 @@ on:
         description: Models to be benchmarked
         required: false
         type: string
-        default: llama
+        default: Qwen/Qwen3-0.6B
       devices:
         description: Target devices to run benchmark
         required: false
@@ -70,7 +72,7 @@ jobs:
           # Separate default values from the workflow dispatch. To ensure defaults are accessible
           # during scheduled runs and to provide flexibility for different defaults between
           # on-demand and periodic benchmarking.
-          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'llama,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8' || 'llama' }}
+          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
          CRON_DEFAULT_DEVICES: samsung_galaxy_s22
        run: |
          set -eux
@@ -201,8 +203,8 @@ jobs:
           HF_MODEL_REPO=${{ matrix.model }}
           OUT_ET_MODEL_NAME="$(echo "$HF_MODEL_REPO" | awk -F'/' '{print $2}' | sed 's/_/-/g' | tr '[:upper:]' '[:lower:]')_${{ matrix.config }}"

+          # Convert HF checkpoint to ET via etLLM path
           if [[ "$HF_MODEL_REPO" == meta-llama/* ]]; then
-            # Llama models on Hugging Face
             if [[ ${{ matrix.config }} == "llama3_spinquant" ]]; then
               # SpinQuant
               # Download prequantized chceckpoint from Hugging Face
@@ -272,6 +274,21 @@ jobs:
                 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
                 --output_name="${OUT_ET_MODEL_NAME}.pte"
               ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            elif [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model" "params.json" "consolidated.00.pth")
+              python -m examples.models.llama.export_llama \
+                --model llama3_2 \
+                --checkpoint "${DOWNLOADED_PATH}/consolidated.00.pth" \
+                --params "${DOWNLOADED_PATH}/params.json" \
+                -kv \
+                --use_sdpa_with_kv_cache \
+                -d fp32 \
+                -X \
+                --xnnpack-extended-ops \
+                -qmode 8da4w -G 32 -E 8,0 \
+                --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}' \
+                --output_name="${OUT_ET_MODEL_NAME}.pte"
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
             elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
               export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
               export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
@@ -292,21 +309,75 @@ jobs:
               OUT_ET_MODEL_NAME="llama3_2_qnn" # Qualcomm hard-coded it in their script
               find . -name "${OUT_ET_MODEL_NAME}.pte" -not -path "./${OUT_ET_MODEL_NAME}.pte" -exec mv {} ./ \;
               ls -lh "${OUT_ET_MODEL_NAME}.pte"
-            else
-              # By default, test with the Hugging Face model and the xnnpack recipe
-              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "original" --files "tokenizer.model")
-              python -m extension.export_util.export_hf_model -hfm="$HF_MODEL_REPO" -o "$OUT_ET_MODEL_NAME"
-              ls -lh "${OUT_ET_MODEL_NAME}.pte"
             fi
-          else
-            echo "Unsupported model ${{ matrix.model }}"
-            exit 1
+          elif [[ "$HF_MODEL_REPO" == "Qwen/Qwen3-0.6B" ]]; then
+            if [[ ${{ matrix.config }} == "et_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+              DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
+              python -m examples.models.llama.export_llama \
+                --model qwen3-0_6b \
+                --params examples/models/qwen3/0_6b_config.json \
+                -kv \
+                --use_sdpa_with_kv_cache \
+                -d fp32 \
+                -X \
+                --xnnpack-extended-ops \
+                -qmode 8da4w \
+                -G 32 \
+                -E 8,0 \
+                --metadata '{"get_bos_id": 151644, "get_eos_ids":[151645]}' \
+                --output_name="${OUT_ET_MODEL_NAME}.pte"
+              ls -lh "${OUT_ET_MODEL_NAME}.pte"
+            fi
+          fi
+
+          if [[ ${{ matrix.config }} == "hf_xnnpack_custom_spda_kv_cache_8da4w" ]]; then
+            DOWNLOADED_PATH=$(
+              bash .ci/scripts/download_hf_hub.sh \
+                --model_id "${HF_MODEL_REPO}" \
+                --files "tokenizer.json"
+            )
+            echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"
+
+            # Install optimum-executorch
+            git clone https://github.com/huggingface/optimum-executorch
+            pushd optimum-executorch
+            # There is no release yet, for CI stability, always test from the same commit on main
+            git checkout 1c653dc49812fc431a22312c7295d97005d22e12
+            python install_dev.py
+            pip list
+
+            ARGS=(
+              "--model" "${HF_MODEL_REPO}"
+              "--task" "text-generation"
+              "--recipe" "xnnpack"
+              "--use_custom_sdpa"
+              "--qlinear"
+              "--qembedding"
+              "--output_dir" ".."
+            )
+
+            # Add conditional arguments based on model
+            case "${HF_MODEL_REPO}" in
+              *"google/gemma-3-1b-it"*)
+                echo "--use_custom_kv_cache can not be used for HybridCache"
+                ;;
+              *)
+                ARGS+=("--use_custom_kv_cache")
+                ;;
+            esac
+
+            optimum-cli export executorch "${ARGS[@]}"
+            popd
+
+            mv model.pte ${OUT_ET_MODEL_NAME}.pte
+            ls -lh "${OUT_ET_MODEL_NAME}.pte"
           fi

-          zip -j model.zip "${OUT_ET_MODEL_NAME}.pte" "${DOWNLOADED_PATH}/tokenizer.model"
+          zip -j model.zip ${OUT_ET_MODEL_NAME}.pte ${DOWNLOADED_PATH}/tokenizer.*
           ls -lh model.zip
-          mkdir -p "${ARTIFACTS_DIR_NAME}"
-          mv model.zip "${ARTIFACTS_DIR_NAME}"
+          mkdir -p ${ARTIFACTS_DIR_NAME}
+          mv model.zip ${ARTIFACTS_DIR_NAME}
+          ls -lh ${ARTIFACTS_DIR_NAME}
         elif [[ ${{ matrix.model }} == "llama" ]]; then
           # Install requirements for export_llama
           PYTHON_EXECUTABLE=python bash examples/models/llama/install_requirements.sh
