Skip to content

Commit f3f7b12

Browse files
committed
Update base for Update on "[ET-VK][qlinear] Faster weight only quantized linear gemv kernel"
## Changes * Introduce a new compute shader for int4 linear's gemv cases that performs much better than the existing shader. This shader is inspired by MNN's gemv_1x1_conv_buf.cl shader. With this compute kernel, transformer models' text generation can execute much faster than before. On Samsung Galaxy S24 for Llama 3.2 1B, generating 128 tokens: Before: ~25 tok/s After: ~49 tok/s ## Why this new shader is faster The biggest reason is the vectorized loading of the uint4 weight buffer. This new shader loads the weight buffer as a buffer/image of `uvec4`, whereas the old shader loads the weight buffer as a buffer/image of `u8vec4`. Using the Adreno Offline Compiler, I found that in the former, only one load instruction was used to load from the weight tensor, whereas in the latter 16 load instructions were used to load from the weight tensor. It appears that the data loading was not being vectorized at the assembly level. This is potentially behaviour that can be improved in the SPIR-V shader compiler. An additional factor is better weight packing layout. The new prepacking routine results in better memory coalescing between threads in a work group. The final major factor is the use of tree-based reduction to co-operatively reduce partial results into the final output. Previously, a single thread was responsible for the final reduction. ## Future Work * Introduce faster shader for int4 linear gemm cases * Update QCSNW to also use these updated shaders Differential Revision: [D78275584](https://our.internmc.facebook.com/intern/diff/D78275584/) [ghstack-poisoned]
2 parents ac53ab0 + 924bbf8 commit f3f7b12

File tree

156 files changed

+6236
-855
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

156 files changed

+6236
-855
lines changed

.ci/scripts/test_phi_3_mini.sh

Lines changed: 5 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -22,31 +22,14 @@ NPROC=8
2222
if hash nproc &> /dev/null; then NPROC=$(nproc); fi
2323

2424
cmake_install_executorch_libraries() {
25-
cmake -DPYTHON_EXECUTABLE=python \
26-
-DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
27-
-DEXECUTORCH_ENABLE_LOGGING=1 \
28-
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
29-
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
30-
-DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
31-
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
32-
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
33-
-DEXECUTORCH_BUILD_XNNPACK=ON \
34-
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
35-
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
36-
-DEXECUTORCH_BUILD_KERNELS_LLM=ON \
37-
-B${BUILD_DIR} .
38-
39-
cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE}
25+
rm -rf cmake-out
26+
cmake --preset llm -DCMAKE_INSTALL_PREFIX=cmake-out -DCMAKE_BUILD_TYPE=${BUILD_TYPE}
27+
cmake --build cmake-out -j16 --target install --config ${BUILD_TYPE}
4028
}
4129

4230
cmake_build_phi_3_mini() {
43-
cmake -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
44-
-DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
31+
cmake -DCMAKE_PREFIX_PATH=${BUILD_DIR} \
4532
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
46-
-DEXECUTORCH_BUILD_KERNELS_LLM=ON \
47-
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
48-
-DEXECUTORCH_BUILD_XNNPACK=ON \
49-
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
5033
-B${BUILD_DIR}/${MODEL_DIR} \
5134
${MODEL_DIR}
5235

@@ -81,7 +64,7 @@ run_and_verify() {
8164
${BUILD_DIR}/${MODEL_DIR}/phi_3_mini_runner \
8265
--model_path=phi-3-mini.pte \
8366
--tokenizer_path=tokenizer.bin \
84-
--seq_len=128 \
67+
--seq_len=60 \
8568
--temperature=0 \
8669
--prompt="<|system|>
8770
You are a helpful assistant.<|end|>

.github/workflows/build-wheels-linux.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ on:
99
- examples/**/*
1010
- pyproject.toml
1111
- setup.py
12+
tags:
13+
- ciflow/binaries/*
1214
push:
1315
branches:
1416
- nightly

.github/workflows/build-wheels-macos.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@ on:
99
- examples/**/*
1010
- pyproject.toml
1111
- setup.py
12+
tags:
13+
- ciflow/binaries/*
1214
push:
1315
branches:
1416
- nightly

.github/workflows/pull.yml

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -603,7 +603,7 @@ jobs:
603603
bash examples/models/phi-3-mini/install_requirements.sh
604604
605605
# run e2e (export, tokenizer and runner)
606-
PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh
606+
PYTHON_EXECUTABLE=python bash .ci/scripts/test_phi_3_mini.sh Release
607607
608608
test-eval_llama-wikitext-linux:
609609
name: test-eval_llama-wikitext-linux
@@ -762,3 +762,66 @@ jobs:
762762
763763
# Test selective build
764764
PYTHON_EXECUTABLE=python bash examples/wasm/test_build_wasm.sh
765+
766+
unittest-nxp-neutron:
767+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
768+
permissions:
769+
id-token: write
770+
contents: read
771+
with:
772+
runner: linux.2xlarge
773+
docker-image: executorch-ubuntu-22.04-clang12
774+
submodules: 'recursive'
775+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
776+
timeout: 90
777+
script: |
778+
set -eux
779+
780+
# The generic Linux job chooses to use base env, not the one setup by the image
781+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
782+
conda activate "${CONDA_ENV}"
783+
784+
# Build and install Executorch
785+
PYTHON_EXECUTABLE=python \
786+
CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \
787+
.ci/scripts/setup-linux.sh --build-tool "cmake"
788+
789+
# Install test requirements
790+
pip install -r backends/nxp/requirements-tests.txt
791+
792+
# Run pytest
793+
PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh
794+
795+
# Run aot example:
796+
PYTHON_EXECUTABLE=python bash examples/nxp/run_aot_example.sh
797+
798+
799+
nxp-build-test:
800+
name: nxp-build-test
801+
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
802+
permissions:
803+
id-token: write
804+
contents: read
805+
with:
806+
runner: linux.2xlarge
807+
docker-image: executorch-ubuntu-22.04-arm-sdk
808+
submodules: 'recursive'
809+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
810+
timeout: 90
811+
script: |
812+
# The generic Linux job chooses to use base env, not the one setup by the image
813+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
814+
conda activate "${CONDA_ENV}"
815+
816+
# Build
817+
cmake -DEXECUTORCH_BUILD_NXP_NEUTRON=ON -Bcmake-out .
818+
cmake --build cmake-out --target executorch_delegate_neutron --config Release
819+
820+
# Build check for the neutron backend library
821+
lib_neutron="cmake-out/backends/nxp/libexecutorch_delegate_neutron.a"
822+
if [ -f $lib_neutron ]; then
823+
echo "Neutron backend library built."
824+
else
825+
echo "Neutron backend library not found!"
826+
exit 1
827+
fi

.github/workflows/trunk.yml

Lines changed: 14 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -302,8 +302,8 @@ jobs:
302302
exit 1
303303
fi
304304
305-
nxp-build-test:
306-
name: nxp-build-test
305+
test-arm-ootb-linux:
306+
name: test-arm-ootb-linux
307307
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
308308
permissions:
309309
id-token: write
@@ -319,18 +319,19 @@ jobs:
319319
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
320320
conda activate "${CONDA_ENV}"
321321
322-
# Build
323-
cmake -DEXECUTORCH_BUILD_NXP_NEUTRON=ON -Bcmake-out .
324-
cmake --build cmake-out --target executorch_delegate_neutron --config Release
322+
# Follow the steps required before running the notebooks
323+
# Try to mirror these as closely as possible
324+
source .ci/scripts/utils.sh
325+
install_executorch "--use-pt-pinned-commit"
325326
326-
# Build check for the neutron backend library
327-
lib_neutron="cmake-out/backends/nxp/libexecutorch_delegate_neutron.a"
328-
if [ -f $lib_neutron ]; then
329-
echo "Neutron backend library built."
330-
else
331-
echo "Neutron backend library not found!"
332-
exit 1
333-
fi
327+
.ci/scripts/setup-arm-baremetal-tools.sh
328+
source examples/arm/ethos-u-scratch/setup_path.sh
329+
330+
# Install requirements for converting notebooks
331+
pip install notebook
332+
333+
# Run OOTB tests
334+
backends/arm/test/test_arm_ootb.sh
334335
335336
test-coreml-delegate:
336337
name: test-coreml-delegate
@@ -771,32 +772,3 @@ jobs:
771772
build-mode: Release
772773
build-tool: cmake
773774
docker-image: executorch-ubuntu-22.04-clang12
774-
775-
unittest-nxp-neutron:
776-
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
777-
permissions:
778-
id-token: write
779-
contents: read
780-
with:
781-
runner: linux.2xlarge
782-
docker-image: executorch-ubuntu-22.04-clang12
783-
submodules: 'recursive'
784-
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
785-
timeout: 90
786-
script: |
787-
set -eux
788-
789-
# The generic Linux job chooses to use base env, not the one setup by the image
790-
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
791-
conda activate "${CONDA_ENV}"
792-
793-
# Build and install Executorch
794-
PYTHON_EXECUTABLE=python \
795-
CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \
796-
.ci/scripts/setup-linux.sh --build-tool "cmake"
797-
798-
# Install test requirements
799-
pip install -r backends/nxp/requirements-tests.txt
800-
801-
# Run pytest
802-
PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh

CMakeLists.txt

Lines changed: 15 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,6 @@
4848
cmake_minimum_required(VERSION 3.24)
4949
project(executorch)
5050

51-
# MARK: - Start EXECUTORCH_H12025_BUILD_MIGRATION
52-
5351
include(${PROJECT_SOURCE_DIR}/tools/cmake/common/preset.cmake)
5452
include(${PROJECT_SOURCE_DIR}/tools/cmake/Utils.cmake)
5553
include(CMakeDependentOption)
@@ -82,6 +80,7 @@ announce_configured_options(BUCK2)
8280

8381
announce_configured_options(CMAKE_CXX_COMPILER_ID)
8482
announce_configured_options(CMAKE_TOOLCHAIN_FILE)
83+
announce_configured_options(BUILD_TESTING)
8584

8685
load_build_preset()
8786
include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/default.cmake)
@@ -97,11 +96,6 @@ else()
9796
endif()
9897
announce_configured_options(CCACHE_PROGRAM)
9998

100-
# Print all the configs that were called with announce_configured_options.
101-
print_configured_options()
102-
103-
# MARK: - End EXECUTORCH_H12025_BUILD_MIGRATION
104-
10599
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
106100

107101
# Setup RPATH. See
@@ -112,11 +106,12 @@ set(CMAKE_SKIP_BUILD_RPATH OFF)
112106
set(CMAKE_BUILD_WITH_INSTALL_RPATH ON)
113107
# Automatically add all linked folders that are NOT in the build directory to
114108
# the rpath (per library?)
115-
# TODO: Doesn't work for us right now because we are
116-
# not installing .so's into the correct locations. For example we have
117-
# libcustom_ops_aot_lib.so depending on _portable_lib.so, which was eventually
118-
# put under <site-packages>/executorch/extension/pybindings/ but this rpath is
119-
# not automatically added because at build time it seems `portable_lib` is being
109+
#
110+
# TODO: Doesn't work for us right now because we are not installing .so's into
111+
# the correct locations. For example we have libcustom_ops_aot_lib.so depending
112+
# on _portable_lib.so, which was eventually put under
113+
# <site-packages>/executorch/extension/pybindings/ but this rpath is not
114+
# automatically added because at build time it seems `portable_lib` is being
120115
# built under the same directory, so no extra rpath is being added. To properly
121116
# fix this we need to install `portable_lib` into the correct path.
122117
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH ON)
@@ -321,8 +316,9 @@ if(EXECUTORCH_USE_CPP_CODE_COVERAGE)
321316
" -fprofile-instr-generate -fcoverage-mapping"
322317
)
323318
else()
324-
message(FATAL_ERROR
325-
"Code coverage for compiler ${CMAKE_CXX_COMPILER_ID} is unsupported"
319+
message(
320+
FATAL_ERROR
321+
"Code coverage for compiler ${CMAKE_CXX_COMPILER_ID} is unsupported"
326322
)
327323
endif()
328324
endif()
@@ -633,8 +629,8 @@ if(EXECUTORCH_BUILD_PYBIND)
633629
endif()
634630

635631
if(EXECUTORCH_BUILD_XNNPACK)
636-
# need to explicitly specify XNNPACK and xnnpack-microkernels-prod here otherwise
637-
# uses XNNPACK and microkernel-prod symbols from libtorch_cpu
632+
# need to explicitly specify XNNPACK and xnnpack-microkernels-prod here
633+
# otherwise uses XNNPACK and microkernel-prod symbols from libtorch_cpu
638634
list(APPEND _dep_libs xnnpack_backend XNNPACK xnnpack-microkernels-prod)
639635
endif()
640636

@@ -748,3 +744,6 @@ if(EXECUTORCH_BUILD_ANDROID_JNI)
748744
endif()
749745

750746
include(Test.cmake)
747+
748+
# Print all the configs that were called with announce_configured_options.
749+
print_configured_options()

CMakePresets.json

Lines changed: 8 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
},
99
{
1010
"name": "macos",
11-
"displayName": "Build everything buildable on macOS",
11+
"displayName": "Build ExecuTorch for macOS",
1212
"inherits": ["common"],
1313
"generator": "Xcode",
1414
"cacheVariables": {
@@ -25,7 +25,7 @@
2525
},
2626
{
2727
"name": "ios",
28-
"displayName": "Build everything buildable on iOS",
28+
"displayName": "Build ExecuTorch for iOS",
2929
"inherits": ["common"],
3030
"generator": "Xcode",
3131
"cacheVariables": {
@@ -42,7 +42,7 @@
4242
},
4343
{
4444
"name": "ios-simulator",
45-
"displayName": "Build everything buildable on iOS simulator",
45+
"displayName": "Build ExecuTorch for iOS Simulator",
4646
"inherits": ["common"],
4747
"generator": "Xcode",
4848
"cacheVariables": {
@@ -59,7 +59,7 @@
5959
},
6060
{
6161
"name": "linux",
62-
"displayName": "Build everything buildable on Linux",
62+
"displayName": "Build ExecuTorch for Linux",
6363
"inherits": ["common"],
6464
"cacheVariables": {
6565
"CMAKE_SYSTEM_NAME": "Linux",
@@ -88,29 +88,21 @@
8888
{
8989
"name": "llm",
9090
"displayName": "Build LLM libraries",
91-
"inherits": [
92-
"common"
93-
],
91+
"inherits": ["common"],
9492
"cacheVariables": {
9593
"EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/llm.cmake",
9694
"CMAKE_OSX_DEPLOYMENT_TARGET": "12.0"
9795
},
9896
"condition": {
9997
"type": "inList",
10098
"string": "${hostSystemName}",
101-
"list": [
102-
"Darwin",
103-
"Linux",
104-
"Windows"
105-
]
99+
"list": ["Darwin", "Linux", "Windows"]
106100
}
107101
},
108102
{
109103
"name": "zephyr",
110-
"displayName": "Build everything buildable on Zephyr RTOS",
111-
"inherits": [
112-
"common"
113-
],
104+
"displayName": "Build ExecuTorch for Zephyr RTOS",
105+
"inherits": ["common"],
114106
"cacheVariables": {
115107
"EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/zephyr.cmake",
116108
"CMAKE_TOOLCHAIN_FILE": "${sourceDir}/examples/zephyr/x86_64-linux-arm-zephyr-eabi-gcc.cmake"

0 commit comments

Comments
 (0)