Commit d43de86

Update base for Update on "[ET-VK] New implementation of cat operator"
## Changes

* Introduce `concat_texture.glsl` and `concat_buffer.glsl` to implement the `torch.cat` operator
* Introduce `Concat.cpp` to replace `Cat.cpp`
* Fix a bug with channels-packed buffer tensors where input data would be copied incorrectly when multiple dims have a stride of 1

## Motivation

> * Introduce `concat_texture.glsl` and `concat_buffer.glsl` to implement the `torch.cat` operator
> * Introduce `Concat.cpp` to replace `Cat.cpp`

The existing implementation of `torch.cat` uses the `copy_channel_offset` shaders. However, these shaders have a critical bug: the output tensor is passed in twice, with different access types, i.e.

```
graph.execute_nodes().emplace_back(new DispatchNode(
    graph,
    VK_KERNEL_FROM_STR(kernel_name),
    global_size,
    local_size,
    // Inputs and Outputs
    {
        {out, vkapi::kWrite},
        {out, vkapi::kRead},
        {in, vkapi::kRead},
    },
```

This creates many validation layer errors because the memory barriers for the resource cannot be formed properly; the shader essentially relies on undefined behaviour to work correctly. As a result, the `cat` operator produces incorrect results on many platforms.

Rather than fix the `copy_offset` shaders, I decided to introduce new shaders to perform the concat operation. The new implementation handles both buffer and texture inputs and is agnostic to memory layout.

Differential Revision: [D76305343](https://our.internmc.facebook.com/intern/diff/D76305343/)

[ghstack-poisoned]
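For contrast with the buggy binding above, here is a minimal sketch of a dispatch that binds each resource exactly once with a single access type, which lets the graph form well-defined memory barriers. It is illustrative only, not the literal code in `Concat.cpp`: `in1`/`in2` are hypothetical input tensors, the multi-input binding style is an assumption, and the snippet is truncated the same way as the one above.

```
// Illustrative sketch (not the actual Concat.cpp code): the output is bound
// once as write-only and the concat inputs once as read-only, so each
// resource has a single access type and barriers can be formed correctly.
graph.execute_nodes().emplace_back(new DispatchNode(
    graph,
    VK_KERNEL_FROM_STR(kernel_name),  // e.g. concat_texture or concat_buffer
    global_size,
    local_size,
    // Inputs and Outputs
    {
        {out, vkapi::kWrite},        // output appears exactly once
        {{in1, in2}, vkapi::kRead},  // hypothetical concat inputs
    },
```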
2 parents: c2aa614 + 4a14fdd

File tree

316 files changed (+10285, -3176 lines)


.ci/scripts/test_llama.sh

Lines changed: 1 addition & 2 deletions

```diff
@@ -156,8 +156,7 @@ cmake_install_executorch_libraries() {
     -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
     -DEXECUTORCH_BUILD_QNN="$QNN" \
-    -DQNN_SDK_ROOT="$QNN_SDK_ROOT" \
-    -Bcmake-out .
+    -DQNN_SDK_ROOT="$QNN_SDK_ROOT"
   cmake --build cmake-out -j9 --target install --config "$CMAKE_BUILD_TYPE"
 }
```

.ci/scripts/unittest-buck2.sh

Lines changed: 8 additions & 4 deletions

```diff
@@ -15,7 +15,7 @@ buck2 query "//backends/apple/... + //backends/example/... + \
 //kernels/optimized/... + //kernels/portable/... + //kernels/quantized/... + \
 //kernels/test/... + //runtime/... + //schema/... + //test/... + //util/..."

-UNBUILDABLE_OPTIMIZED_OPS_REGEX="gelu|fft_r2c|log_softmax"
+UNBUILDABLE_OPTIMIZED_OPS_REGEX="_elu|gelu|fft|log_softmax"
 BUILDABLE_OPTIMIZED_OPS=$(buck2 query //kernels/optimized/cpu/... | grep -E -v $UNBUILDABLE_OPTIMIZED_OPS_REGEX)

 # TODO: build prim_ops_test_cpp again once supported_features works in
@@ -24,6 +24,10 @@ BUILDABLE_KERNELS_PRIM_OPS_TARGETS=$(buck2 query //kernels/prim_ops/... | grep -
 # TODO: expand the covered scope of Buck targets.
 # //runtime/kernel/... is failing because //third-party:torchgen_files's shell script can't find python on PATH.
 # //runtime/test/... requires Python torch, which we don't have in our OSS buck setup.
-buck2 test $BUILDABLE_OPTIMIZED_OPS //kernels/portable/... \
-  $BUILDABLE_KERNELS_PRIM_OPS_TARGETS //runtime/backend/... //runtime/core/... \
-  //runtime/executor: //runtime/kernel/... //runtime/platform/...
+for op in "build" "test"; do
+  buck2 $op $BUILDABLE_OPTIMIZED_OPS \
+    //examples/selective_build:select_all_dtype_selective_lib_portable_lib \
+    //kernels/portable/... \
+    $BUILDABLE_KERNELS_PRIM_OPS_TARGETS //runtime/backend/... //runtime/core/... \
+    //runtime/executor: //runtime/kernel/... //runtime/platform/...
+done
```

.github/workflows/trunk.yml

Lines changed: 1 addition & 1 deletion

```diff
@@ -262,7 +262,7 @@ jobs:
       output=$(ls -la ${elf})
       arr=($output)
       size=${arr[4]}
-      threshold="103068" # ~100KiB
+      threshold="103268" # ~100KiB
       echo "size: $size, threshold: $threshold"
       if [[ "$size" -le "$threshold" ]]; then
         echo "Success $size <= $threshold"
```

CMakeLists.txt

Lines changed: 4 additions & 19 deletions

```diff
@@ -430,14 +430,6 @@ endif()

 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/configurations)

-#
-# gflags: Commandline flag host library.
-#
-
-if(EXECUTORCH_BUILD_GFLAGS)
-  add_subdirectory(third-party/gflags)
-endif()
-
 # Install `executorch` library as well as `executorch-config.cmake` under
 # ${CMAKE_INSTALL_PREFIX}/
 install(
@@ -522,17 +514,6 @@ if(EXECUTORCH_BUILD_CORTEX_M)
 endif()

 if(EXECUTORCH_BUILD_DEVTOOLS)
-  if(NOT EXECUTORCH_BUILD_ARM_BAREMETAL)
-    set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
-        ON
-        CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
-    )
-  else()
-    set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER
-        OFF
-        CACHE BOOL "EXECUTORCH_BUILD_EXTENSION_DATA_LOADER" FORCE
-    )
-  endif()
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/devtools)
 endif()

@@ -573,6 +554,10 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO)
 endif()

 if(EXECUTORCH_BUILD_PYBIND)
+
+  # Add codegen tools subdirectory for selective_build pybind module
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/codegen/tools)
+
   if(NOT EXECUTORCH_BUILD_EXTENSION_DATA_LOADER)
     add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader)
   endif()
```

backends/apple/coreml/scripts/build_tests.sh

Lines changed: 1 addition & 2 deletions

```diff
@@ -33,8 +33,7 @@ cmake "$EXECUTORCH_ROOT_PATH" -B"$CMAKE_EXECUTORCH_BUILD_DIR_PATH" \
   -DPLATFORM=MAC_UNIVERSAL \
   -DDEPLOYMENT_TARGET=13.0 \
   -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
-  -DEXECUTORCH_BUILD_XNNPACK=OFF \
-  -DEXECUTORCH_BUILD_GFLAGS=OFF
+  -DEXECUTORCH_BUILD_XNNPACK=OFF

 cmake --build "$CMAKE_EXECUTORCH_BUILD_DIR_PATH" -j9 -t executorch
```
backends/arm/_passes/arm_pass_manager.py

Lines changed: 6 additions & 2 deletions

```diff
@@ -62,7 +62,10 @@
     UnsqueezeScalarPlaceholdersPass,
 )

-from executorch.backends.arm.tosa_specification import TosaSpecification
+from executorch.backends.arm.tosa_specification import (
+    TosaLoweringContext,
+    TosaSpecification,
+)
 from executorch.backends.transforms.decompose_sdpa import (
     DecomposeScaledDotProductAttention,
 )
@@ -80,7 +83,8 @@ def __init__(self, tosa_spec: TosaSpecification) -> None:
         super().__init__()

     def _transform(self, graph_module: GraphModule):
-        return self(graph_module).graph_module
+        with TosaLoweringContext(self.tosa_spec):
+            return self(graph_module).graph_module

     def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(FuseQuantizedActivationPass())
```

backends/arm/_passes/scalars_to_attribute_pass.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -12,8 +12,8 @@
 from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor

 from executorch.exir.pass_base import ExportPass, PassResult
-from torch.ao.quantization.fx.utils import get_new_attr_name_with_prefix
 from torch.fx import GraphModule, Node
+from torchao.quantization.pt2e.utils import get_new_attr_name_with_prefix


 class ScalarsToAttributePass(ExportPass):
```

backends/arm/quantizer/arm_quantizer.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -247,9 +247,9 @@ def set_module_name(
         quantizer.set_module_name("blocks.sub"), it will quantize all supported operator/operator
         patterns in the submodule with this module name with the given `quantization_config`
         """
-        assert (
-            quantization_config is not None
-        ), " quantization_config == None is not supported yet"
+        # Validate that quantization_config is provided
+        if quantization_config is None:
+            raise ValueError("quantization_config == None is not supported yet")
         self.module_name_config[module_name] = quantization_config
         return self

```
backends/arm/quantizer/quantization_annotator.py

Lines changed: 5 additions & 0 deletions

```diff
@@ -221,10 +221,12 @@ def _match_pattern(
     torch.ops.aten.squeeze_copy.dim,
     torch.ops.aten.squeeze.dim,
     torch.ops.aten.squeeze.dims,
+    torch.ops.aten.unbind.int,
     torch.ops.aten.unsqueeze.default,
     torch.ops.aten.unsqueeze_copy.default,
     torch.ops.aten.reshape.default,
     torch.ops.aten.repeat.default,
+    torch.ops.aten.repeat_interleave.self_int,
     torch.ops.aten.expand_copy.default,
     torch.ops.aten.expand.default,
     # Disabling these as there seems to be an issue with support for complex
@@ -256,6 +258,7 @@ def _match_pattern(
     torch.ops.aten.amin.default,
     torch.ops.aten.clamp.default,
     torch.ops.aten.clamp.Tensor,
+    torch.ops.aten.unflatten.int,
 ]

 _one_to_one_shared_input_or_input_act_qspec = [
@@ -271,6 +274,7 @@ def _match_pattern(
     torch.ops.aten.avg_pool2d.default,
     torch.ops.aten.max_pool2d.default,
     torch.ops.aten.full.default,
+    torch.ops.aten.full,
     torch.ops.aten.flatten.using_ints,
     torch.ops.aten.dropout.default,
     torch.ops.aten.dropout_.default,
@@ -539,6 +543,7 @@ def annotate_graph(  # type: ignore[return]
         if node.target in [
             torch.ops.aten.full_like.default,
             torch.ops.aten.full.default,
+            torch.ops.aten.full,
             torch.ops.aten.scalar_tensor.default,
         ]:
             node.kwargs = {}
```

backends/arm/quantizer/quantization_config.py

Lines changed: 26 additions & 14 deletions

```diff
@@ -29,30 +29,40 @@ def get_input_act_qspec(self) -> QuantizationSpec | None:
         """Returns QuantizationSpec 'input_activation' after asserting that input_activation.qscheme is valid."""
         if self.input_activation is None:
             return None
-        assert self.input_activation.qscheme in [
+        # Validate that input_activation uses a supported qscheme
+        if self.input_activation.qscheme not in [
             torch.per_tensor_affine,
             torch.per_tensor_symmetric,
-        ], f"Unsupported quantization_spec {self.input_activation} for input_activation."
+        ]:
+            raise ValueError(
+                f"Unsupported quantization_spec {self.input_activation} for input_activation."
+            )
         return self.input_activation

     def get_output_act_qspec(self) -> QuantizationSpec | None:
         """Returns QuantizationSpec 'output_activation' after asserting that output_activation.qscheme is valid."""
         if self.output_activation is None:
             return None
-        assert self.output_activation.qscheme in [
+        # Validate that output_activation uses a supported qscheme
+        if self.output_activation.qscheme not in [
             torch.per_tensor_affine,
             torch.per_tensor_symmetric,
-        ], f"Unsupported quantization_spec {self.output_activation} for output_activation."
+        ]:
+            raise ValueError(
+                f"Unsupported quantization_spec {self.output_activation} for output_activation."
+            )
         return self.output_activation

     def get_weight_qspec(self) -> QuantizationSpec | None:
         """Returns QuantizationSpec 'weight' after asserting that weight.qscheme is valid."""
         if self.weight is None:
             return None
-        assert self.weight.qscheme in [
+        # Validate that weight uses a supported qscheme
+        if self.weight.qscheme not in [
             torch.per_tensor_symmetric,
             torch.per_channel_symmetric,
-        ], f"Unsupported quantization_spec {self.weight} for weight"
+        ]:
+            raise ValueError(f"Unsupported quantization_spec {self.weight} for weight")
         return self.weight

     def get_bias_qspec(self, node: torch.fx.Node) -> QuantizationSpec | None:
@@ -61,11 +71,11 @@ def get_bias_qspec(self, node: torch.fx.Node) -> QuantizationSpec | None:
         def _derive_qparams_fn(
             obs_or_fqs: list[ObserverOrFakeQuantize],
         ) -> tuple[torch.Tensor, torch.Tensor]:
-            assert (
-                len(obs_or_fqs) == 2
-            ), "Expecting two obs/fqs, one for activation and one for weight, got: {}".format(
-                len(obs_or_fqs)
-            )
+            # Validate expected number of observers/fake-quantizes
+            if len(obs_or_fqs) != 2:
+                raise ValueError(
+                    f"Expecting two obs/fqs, one for activation and one for weight, got: {len(obs_or_fqs)}"
+                )
             act_obs_or_fq = obs_or_fqs[0]
             weight_obs_or_fq = obs_or_fqs[1]
             act_scale, act_zp = act_obs_or_fq.calculate_qparams()
@@ -94,9 +104,11 @@ def _derive_qparams_fn(

         if self.bias is None:
             return None
-        assert (
-            self.bias.dtype == torch.float
-        ), "Only float dtype for bias is supported for bias right now"
+        # Validate that bias dtype is floating-point
+        if self.bias.dtype != torch.float:
+            raise ValueError(
+                "Only float dtype for bias is supported for bias right now"
+            )
         return self.bias

     def get_fixed_qspec(
```
