Skip to content

Commit be47894

Browse files
author
ssjia
committed
Update on "[ET-VK] Quantized Int8 Linear"
Title says it all! This PR adds implementations for int8 linear layers. Convolution is implemented in a later step, computing convolution as matrix multiplication via the im2col procedure. For both linear and convolution, two versions are implemented: 1. the `q8ta_q8csw` variant, which quantizes the input tensor and then performs integer accumulation via the int8 dot product extension; 2. the `q8csw` variant, which dequantizes the weight tensor in-shader and performs floating point accumulation. The second one is needed to provide an alternative path for executing quantized models if the target GPU does not support the int8 dot product extension. These new ops are tested via the custom op testing + benchmarking framework introduced in the previous diff. Differential Revision: [D81323424](https://our.internmc.facebook.com/intern/diff/D81323424/) [ghstack-poisoned]
2 parents e2c6cd4 + b6b851c commit be47894

File tree

82 files changed

+1639
-669
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

82 files changed

+1639
-669
lines changed

.ci/scripts/test_model.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ test_model() {
9797
bash examples/models/llava/install_requirements.sh
9898
STRICT="--no-strict"
9999
fi
100-
if [[ "${MODEL_NAME}" == "qwen2_5" ]]; then
100+
if [[ "${MODEL_NAME}" == "qwen2_5_1_5b" ]]; then
101101
# Install requirements for export_llama
102102
bash examples/models/llama/install_requirements.sh
103103
# Test export_llm script: python3 -m extension.llm.export.export_llm.

.github/workflows/trunk.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -176,7 +176,7 @@ jobs:
176176
- model: phi_4_mini
177177
backend: portable
178178
runner: linux.arm64.m7g.4xlarge
179-
- model: qwen2_5
179+
- model: qwen2_5_1_5b
180180
backend: portable
181181
runner: linux.arm64.2xlarge
182182
- model: llama3_2_vision_encoder

CMakeLists.txt

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -699,9 +699,7 @@ if(EXECUTORCH_BUILD_KERNELS_TORCHAO)
699699
${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
700700
${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
701701
)
702-
add_subdirectory(
703-
${CMAKE_CURRENT_SOURCE_DIR}/third-party/ao/torchao/experimental
704-
)
702+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/ao/torchao/csrc/cpu)
705703
unset(EXECUTORCH_INCLUDE_DIRS)
706704

707705
executorch_target_link_options_shared_lib(torchao_ops_executorch)

backends/apple/coreml/compiler/torch_ops.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,6 @@ def dequantize_affine(context, node):
175175
int_data.astype(quantized_np_dtype),
176176
zero_point,
177177
scale,
178-
axis=-1,
179178
name=node.name,
180179
)
181180
context.add(output, node.name)

backends/apple/coreml/test/test_torch_ops.py

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,9 +27,9 @@
2727
class TestTorchOps(unittest.TestCase):
2828
edge_compile_config = executorch.exir.EdgeCompileConfig()
2929

30-
def _coreml_partitioner(self):
30+
def _coreml_partitioner(self, *, minimum_deployment_target=ct.target.iOS18):
3131
compile_specs = CoreMLBackend.generate_compile_specs(
32-
minimum_deployment_target=ct.target.iOS18
32+
minimum_deployment_target=minimum_deployment_target
3333
)
3434
return CoreMLPartitioner(compile_specs=compile_specs)
3535

@@ -158,6 +158,33 @@ def test_dequantize_affine_c8w_embedding_b4w_linear(self):
158158
et_prog = delegated_program.to_executorch()
159159
self._compare_outputs(et_prog, model, example_inputs)
160160

161+
def test_dequantize_affine_c8w_embedding_c8w_linear_ios16(self):
162+
model, example_inputs = self._get_test_model()
163+
quantize_(
164+
model,
165+
IntxWeightOnlyConfig(weight_dtype=torch.int8, granularity=PerAxis(0)),
166+
lambda m, fqn: isinstance(m, torch.nn.Embedding),
167+
)
168+
quantize_(
169+
model,
170+
IntxWeightOnlyConfig(weight_dtype=torch.int8, granularity=PerAxis(0)),
171+
)
172+
ep = torch.export.export(model, example_inputs)
173+
delegated_program = executorch.exir.to_edge_transform_and_lower(
174+
ep,
175+
partitioner=[
176+
self._coreml_partitioner(minimum_deployment_target=ct.target.iOS16)
177+
],
178+
)
179+
for node in delegated_program.exported_program().graph.nodes:
180+
if node.op == "call_function":
181+
assert node.target.__name__ in [
182+
"executorch_call_delegate",
183+
"getitem",
184+
], f"Got unexpected node target after delegation: {node.target.__name__}"
185+
et_prog = delegated_program.to_executorch()
186+
self._compare_outputs(et_prog, model, example_inputs)
187+
161188
def test_dequantize_codebook_linear_per_grouped_col(self):
162189
model, example_inputs = self._get_test_model()
163190
quantize_(
@@ -298,6 +325,7 @@ def forward(self, x):
298325
test_runner.test_dequantize_affine_c4w_embedding()
299326
test_runner.test_dequantize_affine_c4w_linear()
300327
test_runner.test_dequantize_affine_c8w_embedding_b4w_linear()
328+
test_runner.test_dequantize_affine_c8w_embedding_c8w_linear_ios16()
301329
test_runner.test_dequantize_codebook_linear_per_grouped_col()
302330
test_runner.test_dequantize_codebook_linear_per_grouped_row()
303331
test_runner.test_dequantize_codebook_embedding_per_grouped_col()

backends/arm/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ For more information on TOSA see https://www.mlplatform.org/tosa/tosa_spec.html
3434
## Layout of key components
3535

3636
Export:
37-
* `tosa_backend.py` - The TOSA conversion flow all other backends rely on.
37+
* `tosa/backend.py` - The TOSA conversion flow all other backends rely on.
3838
* `ethosu/backend.py` - Main entrypoint for the EthosUBackend.
3939
* `vgf_backend.py` - Main entrypoint for VgfBackend.
4040
* For more information see the section on [Arm Backend Architecture](#arm-backend-architecture).

backends/arm/TARGETS

Lines changed: 6 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -37,14 +37,15 @@ python_library(
3737
python_library(
3838
name = "arm_partitioner",
3939
srcs = [
40-
"tosa_backend.py",
41-
"tosa_partitioner.py",
40+
"tosa/backend.py",
41+
"tosa/partitioner.py",
4242
"vgf_backend.py",
4343
"vgf_partitioner.py",
4444
],
4545
deps = [
4646
":arm_backend",
4747
":constants",
48+
"//executorch/backends/arm/debug:schema",
4849
"//executorch/backends/arm/operator_support:operator_support",
4950
"//executorch/backends/arm/_passes:passes",
5051
"//executorch/exir:lib",
@@ -76,9 +77,9 @@ python_library(
7677
"fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/tosa:tosa",
7778
"fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/tosa:tosa",
7879
"//executorch/backends/arm/operators:node_visitor",
79-
"//executorch/backends/arm:tosa_mapping",
80-
"//executorch/backends/arm:tosa_quant_utils",
81-
"//executorch/backends/arm:tosa_utils",
80+
"//executorch/backends/arm/tosa:mapping",
81+
"//executorch/backends/arm/tosa:quant_utils",
82+
"//executorch/backends/arm/tosa:utils",
8283
"//executorch/exir:lib",
8384
],
8485
)
@@ -91,54 +92,6 @@ python_library(
9192
"fbsource//third-party/pypi/ethos-u-vela:ethos-u-vela",
9293
],
9394
)
94-
python_library(
95-
name = "tosa_mapping",
96-
srcs = [
97-
"tosa_mapping.py",
98-
],
99-
deps = [
100-
"fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer",
101-
"fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/serializer:serializer",
102-
"//caffe2:torch",
103-
],
104-
)
105-
python_library(
106-
name = "tosa_quant_utils",
107-
srcs = [
108-
"tosa_quant_utils.py",
109-
],
110-
deps = [
111-
"fbsource//third-party/pypi/numpy:numpy",
112-
"fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer",
113-
"fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/serializer:serializer",
114-
"fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/tosa:tosa",
115-
"fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/tosa:tosa",
116-
":constants",
117-
":tosa_mapping",
118-
"//executorch/exir/dialects:lib",
119-
],
120-
)
121-
python_library(
122-
name = "tosa_specification",
123-
srcs = [
124-
"tosa_specification.py",
125-
],
126-
deps = [
127-
"fbsource//third-party/pypi/packaging:packaging",
128-
"//executorch/exir/backend:compile_spec_schema",
129-
],
130-
)
131-
python_library(
132-
name = "tosa_utils",
133-
srcs = [
134-
"tosa_utils.py",
135-
],
136-
deps = [
137-
"fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/serializer:serializer",
138-
":tosa_quant_utils",
139-
"//executorch/backends/arm/operators:node_visitor",
140-
],
141-
)
14295
python_library(
14396
name = "arm_model_evaluator",
14497
srcs = [

backends/arm/_passes/TARGETS

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,8 @@ python_library(
66
deps = [
77
"//executorch/backends/arm:common",
88
"//executorch/backends/arm:constants",
9-
"//executorch/backends/arm:tosa_quant_utils",
10-
"//executorch/backends/arm:tosa_utils",
9+
"//executorch/backends/arm/tosa:quant_utils",
10+
"//executorch/backends/arm/tosa:utils",
1111
"//executorch/backends/arm/tosa/dialect:lib",
1212
"//executorch/backends/transforms:fuse_view_copy",
1313
"//executorch/backends/transforms:remove_getitem_op",

backends/arm/_passes/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@
3737
from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass # noqa
3838
from .decompose_cumsum_pass import DecomposeCumsumPass # noqa
3939
from .decompose_div_pass import DecomposeDivPass # noqa
40+
from .decompose_div_tensor_mode import DecomposeDivTensorModePass # noqa
4041
from .decompose_elu_pass import DecomposeEluPass # noqa
4142
from .decompose_embedding_pass import DecomposeEmbeddingPass # noqa # noqa
4243
from .decompose_expm1_pass import DecomposeExpm1Pass # noqa

backends/arm/_passes/arm_pass_manager.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@
4242
DecomposeCosineSimilarityPass,
4343
DecomposeCumsumPass,
4444
DecomposeDivPass,
45+
DecomposeDivTensorModePass,
4546
DecomposeEluPass,
4647
DecomposeEmbeddingPass,
4748
DecomposeExpm1Pass,
@@ -211,6 +212,7 @@ def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
211212
DecomposeMeanDimPass(exported_program.graph_module, self.tosa_spec)
212213
)
213214
self.add_pass(DecomposeNotEqualPass())
215+
self.add_pass(DecomposeDivTensorModePass())
214216
self.add_pass(DecomposeDivPass())
215217
self.add_pass(DecomposeSoftmaxPass())
216218
self.add_pass(DecomposeGeluPass())
@@ -289,6 +291,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
289291
self.add_pass(DecomposeNotEqualPass())
290292
self.add_pass(DecomposeCosineSimilarityPass())
291293
self.add_pass(DecomposeGluPass())
294+
self.add_pass(DecomposeDivTensorModePass())
292295
self.add_pass(DecomposeDivPass())
293296
self.add_pass(DecomposeLeakyReLUPass())
294297
self.add_pass(DecomposeLinearVectorNormPass())

0 commit comments

Comments
 (0)