
Commit 5b5101c

Update on "[ET-VK] Adding push constant and UBO version of select and slice ops to improve memory and performance."

Adding push constant and UBO versions of the `select` and `slice` ops to improve memory usage and performance.

* Updated `transfer_buffer.yaml` and `transfer_texture.yaml` to include a `UBO_PARAMS` parameter and generate variants of the `select` and `slice` ops that take UBO parameters.
* Updated `transfer.glsl` to generate UBO and push constant versions of the `select` and `slice` ops.

Differential Revision: [D78095262](https://our.internmc.facebook.com/intern/diff/D78095262/)

[ghstack-poisoned]
2 parents d746b7b + 932c419 commit 5b5101c
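
For context on the two variants named in the commit message: push constants are written directly into the Vulkan command buffer at dispatch time, so small transfer parameters (dim, index, slice start/step, sizes) avoid allocating and binding a separate parameter buffer, while the UBO variant keeps those parameters in a small uniform buffer bound through the descriptor set. The sketch below is illustrative only and is not the ExecuTorch Vulkan implementation; `pack_transfer_params`, its field layout, and the size cutoff are assumptions made for the example.

```python
import struct

# Hypothetical parameter block for a select/slice transfer shader.
# The field layout here is an assumption for illustration, not the
# actual ExecuTorch Vulkan struct.
def pack_transfer_params(dim: int, index_or_start: int, step: int, length: int) -> bytes:
    # Pack four 32-bit signed ints, matching a std430-style int[4] block.
    return struct.pack("<4i", dim, index_or_start, step, length)

# Vulkan guarantees at least 128 bytes of push constant space; larger
# parameter blocks need a UBO. Using that minimum as the cutoff is an
# assumption for this sketch.
PUSH_CONSTANT_LIMIT = 128

def pick_variant(params: bytes) -> str:
    return "push_constant" if len(params) <= PUSH_CONSTANT_LIMIT else "ubo"

params = pack_transfer_params(dim=1, index_or_start=3, step=1, length=16)
print(pick_variant(params), len(params), "bytes")  # push_constant 16 bytes
```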

File tree

13 files changed: +547 −188 lines

backends/cadence/aot/compiler.py

Lines changed: 28 additions & 12 deletions
@@ -8,7 +8,7 @@

 import logging
 from pathlib import Path
-from typing import Optional
+from typing import Callable, cast, Optional

 import executorch.backends.cadence.aot.ops_registrations  # noqa
 import torch
@@ -32,6 +32,7 @@
     ExecutorchBackendConfig,
     ExecutorchProgramManager,
 )
+from executorch.exir.pass_base import PassResult
 from executorch.exir.passes import ToOutVarPass
 from executorch.exir.passes.sym_shape_eval_pass import HintBasedSymShapeEvalPass
 from executorch.exir.program._program import to_edge_with_preserved_ops
@@ -40,7 +41,7 @@
 from torch.export.exported_program import ExportedProgram
 from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e

-from .passes import apply_exir_ops_passes, apply_torch_ops_passes
+from .passes import get_cadence_passes

 from .utils import print_ops_info

@@ -261,20 +262,14 @@ def export_to_edge(
     inputs: tuple[object, ...],
     dump_graphs: bool = False,
     constant_methods: Optional[dict[str, object]] = None,
-    core_aten_exceptions: Optional[list[torch._ops.OpOverload]] = None,
 ) -> EdgeProgramManager:
     assert isinstance(model, torch.nn.Module), "model should be an nn.Module"

     # Export the model into an ExportedProgram.
     expo_program = trace(model, inputs)

-    # Apply passes which transform the ExportedProgram before it gets lowered to edge.
-    expo_program = apply_torch_ops_passes(expo_program)
-
     # Lower the model to edge IR.
-    edge_prog_manager = _lower_ep_to_edge(
-        expo_program, dump_graphs, constant_methods, core_aten_exceptions
-    )
+    edge_prog_manager = _lower_ep_to_edge(expo_program, dump_graphs, constant_methods)

     return edge_prog_manager

@@ -316,7 +311,14 @@ def _lower_ep_to_cadence(
     Lower an existing ExportedProgram to edge IR and apply frontend optimization passes.
     """
     edge_prog_manager = _lower_ep_to_edge(program, dump_graphs=dump_graphs)
-    cadence_prog_manager = apply_exir_ops_passes(opt_level, edge_prog_manager)
+    cadence_passes = get_cadence_passes(opt_level)
+
+    # Run a couple required passes for quant/dequant ops
+    cadence_prog_manager = edge_prog_manager.transform(
+        cast(
+            list[Callable[[torch.fx.GraphModule], Optional[PassResult]]], cadence_passes
+        )
+    )
     return cadence_prog_manager


@@ -327,7 +329,14 @@ def export_to_cadence(
     opt_level: int = 1,
 ) -> EdgeProgramManager:
     edge_prog_manager = export_to_edge(model, inputs, dump_graphs=dump_graphs)
-    cadence_prog_manager = apply_exir_ops_passes(opt_level, edge_prog_manager)
+    cadence_passes = get_cadence_passes(opt_level)
+
+    # Run a couple required passes for quant/dequant ops
+    cadence_prog_manager = edge_prog_manager.transform(
+        cast(
+            list[Callable[[torch.fx.GraphModule], Optional[PassResult]]], cadence_passes
+        )
+    )
     return cadence_prog_manager


@@ -364,8 +373,15 @@ def export_to_executorch_gen_etrecord(
     memory_config: Optional[MemoryConfig] = None,
     dump_graphs: bool = False,
 ) -> ExecutorchProgramManager:
+    cadence_passes = get_cadence_passes(opt_level)
     edge_prog_manager = export_to_edge(model, inputs, dump_graphs)
-    cadence_prog_manager = apply_exir_ops_passes(opt_level, edge_prog_manager)
+
+    # Run a couple required passes for quant/dequant ops
+    cadence_prog_manager = edge_prog_manager.transform(
+        cast(
+            list[Callable[[torch.fx.GraphModule], Optional[PassResult]]], cadence_passes
+        )
+    )

     # Print some information to terminal
     print_ops_info(

backends/cadence/aot/fuse_ops.py

Lines changed: 0 additions & 1 deletion
@@ -1127,7 +1127,6 @@ class CadenceFuseOpsInGraph:
         FuseCascadedTransposeOrPermuteOps,
         FuseCascadedViewOps,
         FuseQuantDequantToRequantizePass,
-        FuseMulTensorIntoQuantPass,
         FuseMulTensorIntoDequantPass,
         FuseMulScalarIntoDequantPass,
         FuseFullThenReshapePass,

backends/cadence/aot/passes.py

Lines changed: 8 additions & 36 deletions
@@ -6,7 +6,7 @@

 # pyre-strict

-from typing import Any, Callable, cast, List, Optional
+from typing import Any, List, Optional

 import torch
 import torch.fx
@@ -28,18 +28,13 @@
     RemoveRedundantOps,
 )
 from executorch.backends.cadence.aot.reorder_ops import CadenceReorderOpsInGraph
-from executorch.backends.cadence.aot.replace_ops import (
-    CadenceReplaceOpsInGraph,
-    ReplaceMulTensorWithMulAndFullOpsPass,
-)
+from executorch.backends.cadence.aot.replace_ops import CadenceReplaceOpsInGraph
 from executorch.backends.cadence.aot.simplify_ops import CadenceSimplifyOpsInGraph
-from executorch.exir import EdgeProgramManager
 from executorch.exir.pass_base import ExportPass, PassResult
 from executorch.exir.pass_manager import PassManager, PassType
 from executorch.exir.passes import dead_code_elimination_pass
 from executorch.exir.passes.scalar_to_tensor_pass import ScalarToTensorPass
 from executorch.exir.passes.spec_prop_pass import SpecPropPass
-from torch.export.exported_program import ExportedProgram


 @register_cadence_pass(CadencePassAttribute(opt_level=0))
@@ -94,37 +89,14 @@ def get_passes_in_default_order() -> List[ExportPass]:
     return pytree.tree_flatten(passes)[0]


-def apply_exir_ops_passes(
+def get_cadence_passes(
     opt_level: int,
-    edge_prog_manager: EdgeProgramManager,
-) -> EdgeProgramManager:
+) -> List[Optional[PassResult]]:
     passes = get_passes_in_default_order()
     pass_filter = create_cadence_pass_filter(opt_level)
-    cadence_passes = [
-        (
-            lambda graph_module, filtered_pass=filtered_pass: filtered_pass()(
-                graph_module
-            )
-        )
+    filtered_passes = [
+        # pyre-ignore[20]: Expect argument graph_module
+        filtered_pass()
         for filtered_pass in list(filter(pass_filter, passes))
     ]
-    cadence_prog_manager = edge_prog_manager.transform(
-        cast(
-            list[Callable[[torch.fx.GraphModule], Optional[PassResult]]], cadence_passes
-        )
-    )
-    return cadence_prog_manager
-
-
-def apply_torch_ops_passes(expo_program: ExportedProgram) -> ExportedProgram:
-    """
-    Applies compiler passes on torch.ops IR, including torch.ops.aten, torch.ops.cadence, etc.
-    expo_program is expected to be the output of the torch.export.export().
-    """
-
-    aten_passes: List[Callable[[torch.fx.GraphModule], Optional[PassResult]]] = [
-        ReplaceMulTensorWithMulAndFullOpsPass()
-    ]
-    # TODO(T230417247): Use PassResult which is currently ignored.
-    PassManager(aten_passes)(expo_program.graph_module)
-    return expo_program
+    return filtered_passes
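
After this refactor, passes.py only builds and returns the filtered, instantiated pass list; applying the passes moves to the callers, which invoke `EdgeProgramManager.transform` themselves (see the compiler.py hunks above). Below is a minimal sketch of the new call pattern; the `apply_cadence_passes` wrapper name is hypothetical and only groups the two steps shown in this diff.

```python
from typing import Callable, Optional, cast

import torch

from executorch.backends.cadence.aot.passes import get_cadence_passes
from executorch.exir import EdgeProgramManager
from executorch.exir.pass_base import PassResult


def apply_cadence_passes(
    edge_prog_manager: EdgeProgramManager, opt_level: int
) -> EdgeProgramManager:
    # Build the filtered, instantiated pass list for this opt level ...
    cadence_passes = get_cadence_passes(opt_level)
    # ... then apply it; the cast matches the callable signature transform() expects.
    return edge_prog_manager.transform(
        cast(
            list[Callable[[torch.fx.GraphModule], Optional[PassResult]]],
            cadence_passes,
        )
    )
```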

examples/demo-apps/apple_ios/LLaMA/LLaMA.xcodeproj/project.pbxproj

Lines changed: 12 additions & 0 deletions
@@ -485,6 +485,8 @@
     03CF43A52CEC5CEC00C7113B /* kernels_custom_debug */,
     03CF43A72CEC5CEC00C7113B /* kernels_optimized */,
     03CF43A92CEC5CEC00C7113B /* kernels_optimized_debug */,
+    03CF43AB2CEC5CEC00C7113B /* kernels_portable */,
+    03CF43AD2CEC5CEC00C7113B /* kernels_portable_debug */,
     03CF43AF2CEC5CEC00C7113B /* kernels_quantized */,
     03CF43B12CEC5CEC00C7113B /* kernels_quantized_debug */,
 );
@@ -1010,6 +1012,16 @@
     package = 03CF43942CEC5CEC00C7113B /* XCRemoteSwiftPackageReference "executorch" */;
     productName = kernels_optimized_debug;
 };
+03CF43AB2CEC5CEC00C7113B /* kernels_portable */ = {
+    isa = XCSwiftPackageProductDependency;
+    package = 03CF43942CEC5CEC00C7113B /* XCRemoteSwiftPackageReference "executorch" */;
+    productName = kernels_portable;
+};
+03CF43AD2CEC5CEC00C7113B /* kernels_portable_debug */ = {
+    isa = XCSwiftPackageProductDependency;
+    package = 03CF43942CEC5CEC00C7113B /* XCRemoteSwiftPackageReference "executorch" */;
+    productName = kernels_portable_debug;
+};
 03CF43AF2CEC5CEC00C7113B /* kernels_quantized */ = {
     isa = XCSwiftPackageProductDependency;
     package = 03CF43942CEC5CEC00C7113B /* XCRemoteSwiftPackageReference "executorch" */;

examples/demo-apps/react-native/rnllama/ios/LlamaBridge.h

Lines changed: 1 addition & 1 deletion
@@ -1,9 +1,9 @@
 #ifndef LLaMABridge_h
 #define LLaMABridge_h

-#import <LLaMARunner/LLaMARunner.h>
 #import <React/RCTBridgeModule.h>
 #import <React/RCTEventEmitter.h>
+#import "LLaMARunner.h"

 NS_ASSUME_NONNULL_BEGIN


Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+ET_PLATFORM[sdk=iphonesimulator*] = simulator
+ET_PLATFORM[sdk=iphoneos*] = ios
+ET_PLATFORM[sdk=macos*] = macos
+
+// Link the Debug version of ExecuTorch runtime to keep the logs.
+// Switch to Release for better performance if logs are not needed.
+OTHER_LDFLAGS = $(inherited) \
+  -force_load $(BUILT_PRODUCTS_DIR)/libexecutorch_debug_$(ET_PLATFORM).a \
+  -force_load $(BUILT_PRODUCTS_DIR)/libbackend_coreml_$(ET_PLATFORM).a \
+  -force_load $(BUILT_PRODUCTS_DIR)/libbackend_mps_$(ET_PLATFORM).a \
+  -force_load $(BUILT_PRODUCTS_DIR)/libbackend_xnnpack_$(ET_PLATFORM).a \
+  -force_load $(BUILT_PRODUCTS_DIR)/libkernels_custom_$(ET_PLATFORM).a \
+  -force_load $(BUILT_PRODUCTS_DIR)/libkernels_optimized_$(ET_PLATFORM).a \
+  -force_load $(BUILT_PRODUCTS_DIR)/libkernels_quantized_$(ET_PLATFORM).a \
+  @$(TEMP_DIR)/cmake/linker_flags
+
+// LLaMARunner requires additional dependencies built with CMake in a custom run script phase.
+// Include headers and libraries from $(TEMP_DIR)/cmake for it.
+HEADER_SEARCH_PATHS = $(inherited) \
+  $(SRCROOT)/../../../../../.. \
+  $(TEMP_DIR)/cmake/include
+
+LIBRARY_SEARCH_PATHS = $(inherited) \
+  $(TEMP_DIR)/cmake/lib
