Commit b450223

Author: morelos (committed)
Update on "[ET-VK][Ops] linear_qta8a_qga4w_qta8o test framework"
# Context

This test framework establishes the foundation for validating the `linear_qta8a_qga4w_qta8o` operator implementation as part of enabling dynamic quantization. The motivation stems from advancing beyond weight-only quantization to fully activation- and weight-quantized linear operations, enabling true integer arithmetic throughout the matrix multiplication for improved performance on GPU hardware. The current weight-only quantized linear implementations in ET-VK dequantize weights to floating point before computation, missing the performance benefits of integer arithmetic.

The operator nomenclature breaks down as:

- **qta8a**: Quantized per-token affine 8-bit activation inputs
- **qga4w**: Quantized per-group affine 4-bit weights
- **qta8o**: Quantized per-token affine 8-bit outputs

# Changes

The reference implementation (`linear_qta8a_qga4w_qta8o_4bit_dequant_impl`) provides a baseline for validating the GPU shader implementation through a deliberately simplified computation path. The quantized int8 input tensor is dequantized using the standard affine transformation `(quantized_input.to(at::kFloat) - input_zero_point) * input_scale`. After dequantization, the implementation performs a standard floating-point linear operation, `at::linear(x_float, weights_dequantized)`, then manually quantizes the result using `at::round(linear_result / output_scale) + output_zero_point`, clamping to the int8 range [-128, 127]. This dequantize → compute → quantize pipeline provides a clear reference against which the GPU's integer arithmetic implementation can be validated.

Differential Revision: [D77173442](https://our.internmc.facebook.com/intern/diff/D77173442/)

[ghstack-poisoned]
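A minimal sketch of the reference path described above, written against the ATen C++ API. It assumes per-tensor scale and zero-point scalars for readability (the operator itself uses per-token activation/output and per-group weight parameters), assumes the 4-bit weights have already been unpacked and dequantized to a float tensor, and the function name and signature here are illustrative, not the actual test-framework entry point:

```cpp
#include <ATen/ATen.h>

// Illustrative dequantize -> compute -> quantize reference path.
// Scales/zero-points are per-tensor scalars here purely for readability;
// the real operator applies them per-token (activations/outputs) and
// per-group (weights).
at::Tensor reference_linear_qta8a_qga4w_qta8o(
    const at::Tensor& quantized_input,     // int8 activations
    double input_scale,
    int64_t input_zero_point,
    const at::Tensor& weights_dequantized, // 4-bit weights, already dequantized to float
    double output_scale,
    int64_t output_zero_point) {
  // Stage 1: affine dequantization of the int8 activations.
  at::Tensor x_float =
      (quantized_input.to(at::kFloat) - input_zero_point) * input_scale;
  // Stage 2: plain floating-point linear against the dequantized weights.
  at::Tensor linear_result = at::linear(x_float, weights_dequantized);
  // Stage 3: requantize the result and clamp to the int8 range [-128, 127].
  at::Tensor requantized =
      at::round(linear_result / output_scale) + output_zero_point;
  return at::clamp(requantized, -128, 127).to(at::kChar);
}
```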
2 parents: 2e1cbf0 + c1920b3

File tree

307 files changed: +8292 −2951 lines

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
+a3942627f5ac048e06b4b1d703b0a6a53bf6da5b

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-5616fa4a68718ead203314a3467f7dd9547153ae
+9b498d3bb28b8e3411ce464dd2755c5b96d92c8f

.ci/docker/common/install_conda.sh

Lines changed: 6 additions & 2 deletions
@@ -13,7 +13,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 install_miniconda() {
 BASE_URL="https://repo.anaconda.com/miniconda"
 CONDA_FILE="Miniconda3-py${PYTHON_VERSION//./}_${MINICONDA_VERSION}-Linux-x86_64.sh"
-if [[ $(uname -m) == "aarch64" ]]; then
+if [[ $(uname -m) == "aarch64" ]]; then
 CONDA_FILE="Miniconda3-py${PYTHON_VERSION//./}_${MINICONDA_VERSION}-Linux-aarch64.sh"
 fi
@@ -71,4 +71,8 @@ fix_conda_ubuntu_libstdcxx() {
 install_miniconda
 install_python
 install_pip_dependencies
-fix_conda_ubuntu_libstdcxx
+# Hack breaks the job on aarch64 but is still necessary everywhere
+# else.
+if [ "$(uname -m)" != "aarch64" ]; then
+fix_conda_ubuntu_libstdcxx
+fi

.ci/docker/conda-env-ci.txt

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
-cmake=3.26.4
+cmake=3.31.2
 ninja=1.10.2
 libuv
 llvm-openmp

.ci/scripts/check_c10_sync.sh

Lines changed: 1 addition & 0 deletions
@@ -12,3 +12,4 @@ pushd pytorch
 git checkout "$pytorch_pin"
 popd
 "$(dirname "${BASH_SOURCE[0]}")"/compare_dirs.sh runtime/core/portable_type/c10/c10 pytorch/c10
+"$(dirname "${BASH_SOURCE[0]}")"/compare_dirs.sh runtime/core/portable_type/c10/torch/standalone pytorch/torch/standalone

.ci/scripts/test_model.sh

Lines changed: 2 additions & 2 deletions
@@ -102,15 +102,15 @@ test_model() {
 bash examples/models/llama/install_requirements.sh
 # Test export_llm script: python3 -m extension.llm.export.export_llm.
 # Use Llama random checkpoint with Qwen 2.5 1.5b model configuration.
-"${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/1_5b_config.json
+"${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/qwen2_5/config/1_5b_config.json
 rm "./${MODEL_NAME}.pte"
 return # Skip running with portable executor runner since portable doesn't support Qwen's biased linears.
 fi
 if [[ "${MODEL_NAME}" == "phi_4_mini" ]]; then
 # Install requirements for export_llama
 bash examples/models/llama/install_requirements.sh
 # Test export_llm script: python3 -m extension.llm.export.export_llm.
-"${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config.json
+"${PYTHON_EXECUTABLE}" -m extension.llm.export.export_llm base.model_class="${MODEL_NAME}" base.params=examples/models/phi_4_mini/config/config.json
 run_portable_executor_runner
 rm "./${MODEL_NAME}.pte"
 return

.github/workflows/android-perf-private-device-experiment.yml

Lines changed: 1 addition & 1 deletion
@@ -57,6 +57,6 @@ jobs:
 id-token: write
 contents: read
 with:
-models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
+models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }}
 devices: samsung_galaxy_s22_private
 benchmark_configs: ${{ inputs.benchmark_configs }}

.github/workflows/android-perf.yml

Lines changed: 5 additions & 13 deletions
@@ -72,7 +72,7 @@ jobs:
 # Separate default values from the workflow dispatch. To ensure defaults are accessible
 # during scheduled runs and to provide flexibility for different defaults between
 # on-demand and periodic benchmarking.
-CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
+CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }}
 CRON_DEFAULT_DEVICES: samsung_galaxy_s22
 run: |
 set -eux
@@ -317,7 +317,7 @@ jobs:
 DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
 python -m extension.llm.export.export_llm \
 base.model_class=qwen3_0_6b \
-base.params=examples/models/qwen3/0_6b_config.json \
+base.params=examples/models/qwen3/config/0_6b_config.json \
 model.use_kv_cache=true \
 model.use_sdpa_with_kv_cache=true \
 model.dtype_override=fp32 \
@@ -341,10 +341,11 @@
 echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"

 # Install optimum-executorch
+OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
 git clone https://github.com/huggingface/optimum-executorch
 pushd optimum-executorch
 # There is no release yet, for CI stability, always test from the same commit on main
-git checkout 4c3b18f6cca68c5ccff809131d570062723d7188
+git checkout $OPTIMUM_ET_COMMIT
 python install_dev.py --skip_override_torch
 pip list

@@ -353,21 +354,12 @@
 "--task" "text-generation"
 "--recipe" "xnnpack"
 "--use_custom_sdpa"
+"--use_custom_kv_cache"
 "--qlinear"
 "--qembedding"
 "--output_dir" ".."
 )

-# Add conditional arguments based on model
-case "${HF_MODEL_REPO}" in
-*"google/gemma-3-1b-it"*)
-echo "--use_custom_kv_cache can not be used for HybridCache"
-;;
-*)
-ARGS+=("--use_custom_kv_cache")
-;;
-esac
-
 optimum-cli export executorch "${ARGS[@]}"
 popd

.github/workflows/apple-perf-private-device-experiment.yml

Lines changed: 1 addition & 1 deletion
@@ -57,6 +57,6 @@ jobs:
 id-token: write
 contents: read
 with:
-models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
+models: ${{ inputs.models || github.event_name == 'schedule' && 'Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'google/gemma-3-1b-it' }}
 devices: apple_iphone_15_private
 benchmark_configs: ${{ inputs.benchmark_configs }}

.github/workflows/apple-perf.yml

Lines changed: 5 additions & 13 deletions
@@ -72,7 +72,7 @@ jobs:
 # Separate default values from the workflow dispatch. To ensure defaults are accessible
 # during scheduled runs and to provide flexibility for different defaults between
 # on-demand and periodic benchmarking.
-CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf' || 'Qwen/Qwen3-0.6B' }}
+CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l,meta-llama/Llama-3.2-1B-Instruct-SpinQuant_INT4_EO8,meta-llama/Llama-3.2-1B-Instruct-QLORA_INT4_EO8,Qwen/Qwen3-0.6B,HuggingFaceTB/SmolLM2-135M,meta-llama/Llama-3.2-1B,allenai/OLMo-1B-hf,google/gemma-3-1b-it' || 'Qwen/Qwen3-0.6B' }}
 CRON_DEFAULT_DEVICES: apple_iphone_15
 run: |
 set -eux
@@ -322,7 +322,7 @@
 DOWNLOADED_PATH=$(bash .ci/scripts/download_hf_hub.sh --model_id "${HF_MODEL_REPO}" --subdir "." --files "tokenizer.json")
 ${CONDA_RUN} python -m extension.llm.export.export_llm \
 base.model_class=qwen3_0_6b \
-base.params=examples/models/qwen3/0_6b_config.json \
+base.params=examples/models/qwen3/config/0_6b_config.json \
 model.use_kv_cache=true \
 model.use_sdpa_with_kv_cache=true \
 model.dtype_override=fp32 \
@@ -346,10 +346,11 @@
 echo "tokenizer.json is downloaded to $DOWNLOADED_PATH"

 # Install optimum-executorch
+OPTIMUM_ET_COMMIT=$(cat .ci/docker/ci_commit_pins/optimum-executorch.txt)
 git clone https://github.com/huggingface/optimum-executorch
 pushd optimum-executorch
 # There is no release yet, for CI stability, always test from the same commit on main
-git checkout 4c3b18f6cca68c5ccff809131d570062723d7188
+git checkout $OPTIMUM_ET_COMMIT
 ${CONDA_RUN} python install_dev.py --skip_override_torch
 pip list
@@ -358,21 +359,12 @@
 "--task" "text-generation"
 "--recipe" "xnnpack"
 "--use_custom_sdpa"
+"--use_custom_kv_cache"
 "--qlinear"
 "--qembedding"
 "--output_dir" ".."
 )

-# Add conditional arguments based on model
-case "${HF_MODEL_REPO}" in
-*"google/gemma-3-1b-it"*)
-echo "--use_custom_kv_cache can not be used for HybridCache"
-;;
-*)
-ARGS+=("--use_custom_kv_cache")
-;;
-esac
-
 ${CONDA_RUN} optimum-cli export executorch "${ARGS[@]}"
 popd
