Commit 4b42871

Update base for Update on "[ET-VK] Introduce AOT operator registry"
## Changes

Move the following files to the root directory of the Vulkan backend:

* `backends/vulkan/partitioner/supported_ops.py` -> `backends/vulkan/op_registry.py`
* `backends/vulkan/_passes/custom_ops_defs.py` -> `backends/vulkan/custom_ops_lib.py`

In the new `op_registry.py` file, the way operator features are specified is reworked to provide much more detail about each operator's Vulkan implementation. See the new `OpFeatures` class for more details. An example of registering a new operator with the export flow:

```
@update_features(
    [
        exir_ops.edge.aten._log_softmax.default,
        exir_ops.edge.aten._softmax.default,
        exir_ops.edge.aten.mean.dim,
        exir_ops.edge.aten.sum.dim_IntList,
        exir_ops.edge.aten.amax.default,
        exir_ops.edge.aten.amin.default,
    ]
)
def register_reduce_op(features: OpFeatures):
    features.texture_impl = TextureImplFeatures(
        uses_packed_dim=True,
    )
    features.resize_fn = True

    def check_reduce_node(node: torch.fx.Node) -> bool:
        dim_list = node.args[1]
        assert isinstance(dim_list, list)
        if len(dim_list) != 1:
            return False

        keepdim = node.args[2]
        assert isinstance(keepdim, bool)
        if not keepdim:
            return False

        return True

    features.check_node_fn = check_reduce_node
    return features
```

## Rationale

The purpose of these changes is to centralize operator definitions so that there is a common source of truth about the capabilities of operator implementations in Vulkan. This way, the partitioner does not have to implement ad-hoc functions for specific operators (e.g. `is_valid_to_copy`) and graph transforms do not have to maintain their own operator metadata (`USES_WEIGHTS` in `insert_prepack_nodes`).

Differential Revision: [D64915640](https://our.internmc.facebook.com/intern/diff/D64915640/)

[ghstack-poisoned]
2 parents fdb7392 + 16b633b commit 4b42871
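
As an illustration of the rationale above, the sketch below shows how a consumer such as the Vulkan partitioner could drive its decisions purely from the centralized registry instead of ad-hoc per-operator helpers. This is a hypothetical sketch: the `has_impl`/`get_op_features` lookup helpers, the import path, and the `is_node_supported` wrapper are assumed names for illustration; only `OpFeatures` and `check_node_fn` come from the commit message.

```python
# Hypothetical sketch only. `has_impl`, `get_op_features`, and `is_node_supported`
# are assumed names for illustration; they are not APIs confirmed by this commit.
import torch

from executorch.backends.vulkan.op_registry import (  # assumed import path
    get_op_features,
    has_impl,
)


def is_node_supported(node: torch.fx.Node) -> bool:
    """Decide partitioning for a node from registry metadata alone."""
    if node.op != "call_function":
        return False
    # The registry knows whether a Vulkan implementation exists at all.
    if not has_impl(node.target):
        return False

    features = get_op_features(node.target)
    # Op-specific validity rules live with the op registration
    # (e.g. `check_reduce_node` above), not in the partitioner.
    if features.check_node_fn is not None:
        return features.check_node_fn(node)
    return True
```

With this shape, adding Vulkan support for a new op means touching only `op_registry.py`.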

20 files changed: +327 lines, -38 lines

Lines changed: 1 addition & 1 deletion

```diff
@@ -1 +1 @@
-export-D64151426
+bd5482c7c3e1197e10c46ff739027f917d9c1fcc
```

build/packaging/smoke_test.py

Lines changed: 17 additions & 0 deletions

```diff
@@ -15,6 +15,14 @@
 # will fail and the process will exit.
 from executorch.extension.pybindings import portable_lib  # usort: skip
 
+# Import custom ops. This requires portable_lib to be loaded first.
+from executorch.extension.llm.custom_ops import (  # noqa: F401, F403
+    sdpa_with_kv_cache,
+)  # usort: skip
+
+# Import quantized ops. This requires portable_lib to be loaded first.
+from executorch.kernels import quantized  # usort: skip # noqa: F401, F403
+
 # Import this after importing the ExecuTorch pybindings. If the pybindings
 # links against a different torch.so than this uses, there will be a set of
 # symbol comflicts; the process will either exit now, or there will be issues
@@ -75,6 +83,15 @@ def main():
     assert len(ops) > 0, "Empty operator list"
     print(f"Found {len(ops)} operators; first element '{ops[0]}'")
 
+    # Make sure custom ops are registered.
+    assert (
+        "llama::sdpa_with_kv_cache" in ops
+    ), f"sdpa_with_kv_cache not registered, Got ops: {ops}"
+
+    # Make sure quantized ops are registered.
+    assert (
+        "quantized_decomposed::add.out" in ops
+    ), f"quantized_decomposed::add.out not registered, Got ops: {ops}"
     # Export LinearModel to .pte data.
     pte_data: bytes = export_linear_model()
```

examples/models/llama/llama_transformer.py

Lines changed: 10 additions & 9 deletions

```diff
@@ -265,21 +265,22 @@ class Attention(nn.Module):
     def __init__(self, args: ModelArgs, layer_id: int):
         super().__init__()
         self.use_kv_cache = args.use_kv_cache
-        self.n_kv_heads = args.n_heads if args.n_kv_heads is None else args.n_kv_heads
-        assert args.n_heads % self.n_kv_heads == 0
+        self.n_heads = args.n_heads
+        self.n_kv_heads = self.n_heads if args.n_kv_heads is None else args.n_kv_heads
+        assert self.n_heads % self.n_kv_heads == 0
         model_parallel_size = 1
-        self.n_local_heads = args.n_heads // model_parallel_size
+        self.n_local_heads = self.n_heads // model_parallel_size
         self.n_local_kv_heads = self.n_kv_heads // model_parallel_size
         self.n_rep = self.n_local_heads // self.n_local_kv_heads
-        self.head_dim = args.dim // args.n_heads
+        self.head_dim = args.dim // self.n_heads
         self.max_batch_size = args.max_batch_size
         self.max_seq_len = args.max_seq_len
         self.dim = args.dim
-        # args.dim = 4096, args.n_heads = 32, self.head_dim = 4096 / 32 = 125
-        self.wq = nn.Linear(args.dim, args.n_heads * self.head_dim, bias=False)
-        self.wk = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
-        self.wv = nn.Linear(args.dim, self.n_kv_heads * self.head_dim, bias=False)
-        self.wo = nn.Linear(args.n_heads * self.head_dim, args.dim, bias=False)
+        # self.dim = 4096, self.n_heads = 32, self.head_dim = 4096 / 32 = 125
+        self.wq = nn.Linear(self.dim, self.n_heads * self.head_dim, bias=False)
+        self.wk = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)
+        self.wv = nn.Linear(self.dim, self.n_kv_heads * self.head_dim, bias=False)
+        self.wo = nn.Linear(self.n_heads * self.head_dim, self.dim, bias=False)
 
         self.layer_id = layer_id
```

examples/models/llama/source_transformation/lora.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -70,6 +70,8 @@ def __init__(
             precision=precision,
             scales_precision=scales_precision,
         )
+        # TODO(lunwenh): Remove this once TorchAO's commit pin in ExecuTorch is updated to include this PR
+        self.zeros = torch.zeros_like(self.zeros)
         self.adaptor = LoRAAdaptorLinear(
             in_features,
             out_features,
```

examples/models/llama/source_transformation/pre_quantization.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -46,6 +46,8 @@ def replacement_fn(child: torch.nn.Module) -> torch.nn.Module:
             precision=precision,
             scales_precision=scales_precision,
         )
+        # TODO(lunwenh): Remove this once TorchAO's commit pin in ExecuTorch is updated to include this PR
+        new_linear.zeros = torch.zeros_like(new_linear.zeros)
         return new_linear
 
     _replace_with_custom_fn_if_matches_filter(module, replacement_fn, filter_fn)
```

examples/models/llama/source_transformation/quantize.py

Lines changed: 7 additions & 7 deletions

```diff
@@ -375,7 +375,7 @@ def __init__(
         self.in_features = in_features
         self.out_features = out_features
         self.register_buffer(
-            "weight", torch.empty((out_features, in_features), dtype=torch.int8)
+            "weight", torch.zeros((out_features, in_features), dtype=torch.int8)
         )
         self.register_buffer("scales", torch.ones(out_features, dtype=torch.bfloat16))
 
@@ -448,18 +448,18 @@ def __init__(
         # currently storing unpacked int8 weights
         self.register_buffer(
             "weight",
-            torch.empty((out_features, in_features), dtype=torch.int8),
+            torch.zeros((out_features, in_features), dtype=torch.int8),
         )
         self.register_buffer(
             "scales",
-            torch.empty(
+            torch.zeros(
                 (out_features),
                 dtype=torch.float32,
             ),
         )
         self.register_buffer(
             "zeros",
-            torch.empty(
+            torch.zeros(
                 (out_features),
                 dtype=torch.float32,
             ),
@@ -632,15 +632,15 @@ def __init__(
         if not packed:
             self.register_buffer(
                 "weight",
-                torch.empty(
+                torch.zeros(
                     (vocab_size, embedding_dim), dtype=torch.int8, device=device
                 ),
             )
         else:  # packed
             if bitwidth == 2:
                 self.register_buffer(
                     "weight",
-                    torch.empty(
+                    torch.zeros(
                         (vocab_size, embedding_dim // 4),
                         dtype=torch.uint8,
                         device=device,
@@ -649,7 +649,7 @@ def __init__(
             elif bitwidth == 4:
                 self.register_buffer(
                     "weight",
-                    torch.empty(
+                    torch.zeros(
                         (vocab_size, embedding_dim // 2),
                         dtype=torch.uint8,
                         device=device,
```

examples/qualcomm/oss_scripts/llama2/llama.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -564,6 +564,7 @@ def post_process():
         exit(f"Finish the running pre_gen_pte from {args.pre_gen_pte}")
 
     if args.compile_only:
+        compile(args)
         exit(f"Finish compile_only and save to {args.artifact}")
 
     try:
```

exir/program/_program.py

Lines changed: 2 additions & 5 deletions

```diff
@@ -453,7 +453,6 @@ def to_executorch(
     def __deepcopy__(
         self, memo: Optional[Dict[int, Any]] = None
     ) -> "ExirExportedProgram":
-
         new_eep = ExirExportedProgram(
             copy.deepcopy(self.exported_program, memo),
             self.after_to_edge_passes,
@@ -764,7 +763,6 @@ def _replace_aten_ops_with_transformed_ops(
     program: ExportedProgram,
     partitioner,
 ):
-
     ops_to_not_decompose = set()
     partitioners = partitioner.get(name)
     if partitioners is None:
@@ -1020,9 +1018,9 @@ def to_edge_transform_and_lower(
     aten_programs = programs
 
     if not isinstance(partitioner, dict) and partitioner is not None:
-        partitioner = {"forward": partitioner}
+        partitioner = {name: partitioner for name in aten_programs.keys()}
     elif partitioner is None:
-        partitioner = {"forward": []}
+        partitioner = {name: [] for name in aten_programs.keys()}
 
     edge_manager = _gen_edge_manager_for_partitioners(
         partitioner, aten_programs, config, constant_methods
@@ -1037,7 +1035,6 @@ def to_edge_transform_and_lower(
         edge_manager = edge_manager.to_backend({name: curr_partitioner})
 
     for name, program in edge_manager._edge_programs.items():
-
         ops_set_to_not_decompose: Set[torch._ops.OpOverload] = set()
         partitioners = partitioner.get(name, [])
         for curr_partitioner in partitioners:
```
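
A side note on the `to_edge_transform_and_lower` change above: a partitioner passed as a plain list is now fanned out to every method in the exported-programs dict instead of being keyed only under `"forward"`. Below is a rough usage sketch under assumptions: `TinyModel`, the `"encode"`/`"decode"` method names, and the use of the XNNPACK partitioner are illustrative and not part of this commit.

```python
# Illustrative sketch; the model and method names are made up for this example.
import torch
from torch.export import export

from executorch.backends.xnnpack.partition.xnnpack_partitioner import XnnpackPartitioner
from executorch.exir import to_edge_transform_and_lower


class TinyModel(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.nn.functional.relu(x)


programs = {
    "encode": export(TinyModel(), (torch.randn(1, 4),)),
    "decode": export(TinyModel(), (torch.randn(1, 4),)),
}

# Previously the list below was stored as {"forward": [...]} and would not match
# "encode"/"decode"; with this change it is expanded to one entry per method name.
edge_manager = to_edge_transform_and_lower(
    programs,
    partitioner=[XnnpackPartitioner()],
)
```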

exir/schema.py

Lines changed: 13 additions & 2 deletions

```diff
@@ -35,14 +35,24 @@ class OptionalTensorList:
 
 class TensorShapeDynamism(IntEnum):
     """
-    Check schema.fbs for explanations of this enum.
+    Check program.fbs for explanations of this enum.
    """
 
     STATIC = 0
     DYNAMIC_BOUND = 1
     DYNAMIC_UNBOUND = 2
 
 
+@dataclass
+class ExtraTensorInfo:
+    """
+    Check program.fbs for explanations of this enum.
+    """
+
+    mutable_data_segments_idx: Optional[int] = None
+    fully_qualified_name: Optional[str] = None
+
+
 @dataclass
 class Tensor:
     scalar_type: ScalarType
@@ -54,8 +64,9 @@ class Tensor:
     data_buffer_idx: int
     allocation_info: Optional[AllocationDetails]
 
-    # check schema.fbs for explanations
+    # check program.fbs for explanations.
     shape_dynamism: TensorShapeDynamism
+    extra_tensor_info: Optional[ExtraTensorInfo] = None
 
 
 @dataclass
```

exir/tests/test_memory_format_ops_pass_utils.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -69,7 +69,7 @@ class MemoryFormatOpsPassTestUtils:
     def memory_format_test_runner(
         test_class: unittest.TestCase, test_set: MemoryFormatTestSet
     ):
-        before = export(test_set.module, test_set.sample_input)
+        before = export(test_set.module, test_set.sample_input).run_decompositions({})
 
         if test_set.use_xnnpack:
             epm = to_edge_transform_and_lower(
```
