Skip to content

Commit a6fc34a

Browse files
committed
Update on "[ET-VK][ez] Fix 8 bit linear compute shader dispatch"
## Context Currently, for the `q_8w_linear` shader, both the texture and the buffer variants use the same global work group and local work group setting. Specially, the global work group is set to `{out.numel(), 1, 1}` and the local work group is set to `{64, 1, 1}`. However, I believe this results in a very poor memory re-use for the texture shader. In this configuration: * Within a work group each invocation will be requesting a different row of A - 64 rows of A requested in total * All work groups will be requesting the same row of B * One work group will load 65 unique rows from A and B Compare this to a local work group size of `{8, 8, 1}` * Across the work group, 8 rows will be loaded from A and 8 rows will be loaded from B * One work group will load 16 unique rows total from A and B Evidently, there is better memory re-use in the latter work group as fewer unique rows are loaded. ## Changes Modify the `q_8w_linear` shader to use `{8, 8, 1}` local wg if possible. If `M` is small, then instead use `{4, 16, 1}` or `{2, 32, 1}` to reduce the number of inactive invocations. Differential Revision: [D71706489](https://our.internmc.facebook.com/intern/diff/D71706489/) [ghstack-poisoned]
2 parents 44aace8 + ca11834 commit a6fc34a

File tree

270 files changed

+2971
-2464
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

270 files changed

+2971
-2464
lines changed
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
295f2ed4d103017f7e19a7b8263ece606cd629db
1+
7ae0ce6360b6e4f944906502d20da24c04debee5

.ci/scripts/build_android_instrumentation.sh

Lines changed: 6 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -12,30 +12,10 @@ if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
1212
fi
1313
which "${PYTHON_EXECUTABLE}"
1414

15-
build_android_test() {
16-
mkdir -p extension/android/executorch_android/src/androidTest/resources
17-
cp extension/module/test/resources/add.pte extension/android/executorch_android/src/androidTest/resources
18-
pushd extension/android
19-
ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:testDebugUnitTest
20-
ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:assembleAndroidTest
21-
popd
22-
}
15+
mkdir -p "${BUILD_AAR_DIR}"/executorch_android/src/androidTest/resources
16+
cp extension/module/test/resources/add.pte "${BUILD_AAR_DIR}"/executorch_android/src/androidTest/resources
2317

24-
collect_artifacts_to_be_uploaded() {
25-
ARTIFACTS_DIR_NAME="$1"
26-
# Collect Java library test
27-
JAVA_LIBRARY_TEST_DIR="${ARTIFACTS_DIR_NAME}/library_test_dir"
28-
mkdir -p "${JAVA_LIBRARY_TEST_DIR}"
29-
cp extension/android/executorch_android/build/outputs/apk/androidTest/debug/*.apk "${JAVA_LIBRARY_TEST_DIR}"
30-
}
31-
32-
main() {
33-
build_android_test
34-
if [ -n "$ARTIFACTS_DIR_NAME" ]; then
35-
collect_artifacts_to_be_uploaded ${ARTIFACTS_DIR_NAME}
36-
fi
37-
}
38-
39-
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
40-
main "$@"
41-
fi
18+
pushd "${BUILD_AAR_DIR}"
19+
ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:testDebugUnitTest
20+
ANDROID_HOME="${ANDROID_SDK:-/opt/android/sdk}" ./gradlew :executorch_android:assembleAndroidTest
21+
popd

.ci/scripts/gather_test_models.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
"dl3": "linux.4xlarge.memory",
3434
"emformer_join": "linux.4xlarge.memory",
3535
"emformer_predict": "linux.4xlarge.memory",
36-
"phi-4-mini": "linux.4xlarge.memory",
36+
"phi_4_mini": "linux.4xlarge.memory",
3737
}
3838
}
3939

.ci/scripts/test_llama_torchao_lowbit.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,6 @@ ${PYTHON_EXECUTABLE} -m examples.models.llama.export_llama \
7878
-qmode "torchao:8da${QLINEAR_BITWIDTH}w" \
7979
--group_size ${QLINEAR_GROUP_SIZE} \
8080
-E "torchao:${QEMBEDDING_BITWIDTH},${QEMBEDDING_GROUP_SIZE}" \
81-
--disable_dynamic_shape \
8281
-d fp32
8382

8483
# Test run

.ci/scripts/test_model.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,11 +100,11 @@ test_model() {
100100
rm "./${MODEL_NAME}.pte"
101101
return # Skip running with portable executor runnner since portable doesn't support Qwen's biased linears.
102102
fi
103-
if [[ "${MODEL_NAME}" == "phi-4-mini" ]]; then
103+
if [[ "${MODEL_NAME}" == "phi_4_mini" ]]; then
104104
# Install requirements for export_llama
105105
bash examples/models/llama/install_requirements.sh
106106
# Test export_llama script: python3 -m examples.models.llama.export_llama.
107-
"${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -c examples/models/llama/params/demo_rand_params.pth -p examples/models/phi-4-mini/config.json
107+
"${PYTHON_EXECUTABLE}" -m examples.models.llama.export_llama --model "${MODEL_NAME}" -c examples/models/llama/params/demo_rand_params.pth -p examples/models/phi_4_mini/config.json
108108
run_portable_executor_runner
109109
rm "./${MODEL_NAME}.pte"
110110
return

.ci/scripts/unittest-linux.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,7 @@ if [[ "$BUILD_TOOL" == "cmake" ]]; then
2121
source .ci/scripts/setup-vulkan-linux-deps.sh
2222

2323
PYTHON_EXECUTABLE=python \
24-
EXECUTORCH_BUILD_PYBIND=ON \
25-
CMAKE_ARGS="-DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
24+
CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
2625
.ci/scripts/setup-linux.sh "$@"
2726

2827
# Install llama3_2_vision dependencies.

.ci/scripts/unittest-macos-buck2.sh

100644100755
File mode changed.

.ci/scripts/unittest-macos.sh

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,13 @@ export TMP_DIR=$(mktemp -d)
1919
export PATH="${TMP_DIR}:$PATH"
2020
trap 'rm -rfv ${TMP_DIR}' EXIT
2121

22-
if [[ "$BUILD_TOOL" == "cmake" ]]; then
23-
# Setup MacOS dependencies as there is no Docker support on MacOS atm
24-
PYTHON_EXECUTABLE=python \
25-
EXECUTORCH_BUILD_PYBIND=ON \
26-
CMAKE_ARGS="-DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_MPS=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
27-
${CONDA_RUN} --no-capture-output \
28-
.ci/scripts/setup-macos.sh "$@"
22+
# Setup MacOS dependencies as there is no Docker support on MacOS atm
23+
PYTHON_EXECUTABLE=python \
24+
CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_COREML=ON -DEXECUTORCH_BUILD_MPS=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
25+
${CONDA_RUN} --no-capture-output \
26+
.ci/scripts/setup-macos.sh "$@"
2927

28+
if [[ "$BUILD_TOOL" == "cmake" ]]; then
3029
# Install llama3_2_vision dependencies.
3130
PYTHON_EXECUTABLE=python \
3231
${CONDA_RUN} --no-capture-output \

.ci/scripts/utils.sh

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,46 @@ install_pytorch_and_domains() {
6060
# Fetch the target commit
6161
pushd pytorch || return
6262
git checkout "${TORCH_VERSION}"
63-
git submodule update --init --recursive
6463

65-
export USE_DISTRIBUTED=1
66-
# Then build and install PyTorch
67-
python setup.py bdist_wheel
68-
pip install "$(echo dist/*.whl)"
64+
local system_name=$(uname)
65+
if [[ "${system_name}" == "Darwin" ]]; then
66+
local platform=$(python -c 'import sysconfig; import platform; v=platform.mac_ver()[0].split(".")[0]; platform=sysconfig.get_platform().split("-"); platform[1]=f"{v}_0"; print("_".join(platform))')
67+
fi
68+
local python_version=$(python -c 'import platform; v=platform.python_version_tuple(); print(f"{v[0]}{v[1]}")')
69+
local torch_release=$(cat version.txt)
70+
local torch_short_hash=${TORCH_VERSION:0:7}
71+
local torch_wheel_path="cached_artifacts/pytorch/executorch/pytorch_wheels/${system_name}/${python_version}"
72+
local torch_wheel_name="torch-${torch_release}%2Bgit${torch_short_hash}-cp${python_version}-cp${python_version}-${platform:-}.whl"
73+
74+
local cached_torch_wheel="https://gha-artifacts.s3.us-east-1.amazonaws.com/${torch_wheel_path}/${torch_wheel_name}"
75+
# Cache PyTorch wheel is only needed on MacOS, Linux CI already has this as part
76+
# of the Docker image
77+
local torch_wheel_not_found=0
78+
if [[ "${system_name}" == "Darwin" ]]; then
79+
pip install "${cached_torch_wheel}" || torch_wheel_not_found=1
80+
else
81+
torch_wheel_not_found=1
82+
fi
83+
84+
# Found no such wheel, we will build it from source then
85+
if [[ "${torch_wheel_not_found}" == "1" ]]; then
86+
echo "No cached wheel found, continue with building PyTorch at ${TORCH_VERSION}"
87+
88+
git submodule update --init --recursive
89+
USE_DISTRIBUTED=1 python setup.py bdist_wheel
90+
pip install "$(echo dist/*.whl)"
91+
92+
# Only AWS runners have access to S3
93+
if command -v aws && [[ -z "${GITHUB_RUNNER:-}" ]]; then
94+
for wheel_path in dist/*.whl; do
95+
local wheel_name=$(basename "${wheel_path}")
96+
echo "Caching ${wheel_name}"
97+
aws s3 cp "${wheel_path}" "s3://gha-artifacts/${torch_wheel_path}/${wheel_name}"
98+
done
99+
fi
100+
else
101+
echo "Use cached wheel at ${cached_torch_wheel}"
102+
fi
69103

70104
# Grab the pinned audio and vision commits from PyTorch
71105
TORCHAUDIO_VERSION=$(cat .github/ci_commit_pins/audio.txt)

.ci/scripts/wheel/envvar_base.sh

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,10 @@
88
# should typically only contain shell variable assignments. Be sure to export
99
# any variables so that subprocesses will see them.
1010

11-
# Enable pybindings so that users can execute ExecuTorch programs from python.
12-
export EXECUTORCH_BUILD_PYBIND=1
13-
1411
# Ensure that CMAKE_ARGS is defined before referencing it. Defaults to empty
1512
# if not defined.
1613
export CMAKE_ARGS="${CMAKE_ARGS:-}"
1714

1815
# Link the XNNPACK backend into the pybindings runtime so that users can execute
1916
# ExecuTorch programs that delegate to it.
20-
CMAKE_ARGS="${CMAKE_ARGS} -DEXECUTORCH_BUILD_XNNPACK=ON"
17+
CMAKE_ARGS="${CMAKE_ARGS} -DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON"

0 commit comments

Comments
 (0)