Commit 0836968

Author: jorgep31415
Update on "[ET-VK] Only save_cache the first time"
Add a check in `save_cache` to return early if the cache file already exists. Currently we append the same cache data to that file, which makes no difference to model-load time.

Differential Revision: [D66179919](https://our.internmc.facebook.com/intern/diff/D66179919/)

[ghstack-poisoned]
2 parents 8f344f9 + 7314968 commit 0836968
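
The `save_cache` change itself is not among the hunks shown below (this merge commit pulls in changes from its parents); as a rough illustration of the early-return check described in the message, here is a minimal C++ sketch with a hypothetical `save_cache` signature and file layout, not the actual ET-VK implementation:

```cpp
#include <filesystem>
#include <fstream>
#include <string>
#include <vector>

// Hypothetical helper illustrating the check described in the commit message;
// the real ET-VK function name, signature, and storage format may differ.
void save_cache(const std::string& cache_path, const std::vector<char>& cache_data) {
  // If a cache file already exists, skip the write entirely. Writing the same
  // cache data again only grows the file without improving model-load time.
  if (std::filesystem::exists(cache_path)) {
    return;
  }
  std::ofstream out(cache_path, std::ios::binary);
  out.write(cache_data.data(), static_cast<std::streamsize>(cache_data.size()));
}
```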

File tree

18 files changed: +336 -49 lines changed

.github/workflows/android-perf.yml

Lines changed: 1 addition & 1 deletion
@@ -136,7 +136,7 @@ jobs:
       fail-fast: false
     with:
       runner: linux.4xlarge
-      docker-image: executorch-ubuntu-22.04-clang12-android
+      docker-image: executorch-ubuntu-22.04-qnn-sdk
       submodules: 'true'
       timeout: 60
       upload-artifact: android-models

.github/workflows/trunk.yml

Lines changed: 1 addition & 1 deletion
@@ -302,7 +302,7 @@ jobs:
       fail-fast: false
     with:
       runner: linux.2xlarge
-      docker-image: executorch-ubuntu-22.04-clang12-android
+      docker-image: executorch-ubuntu-22.04-qnn-sdk
       submodules: 'true'
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
       timeout: 900

.gitmodules

Lines changed: 2 additions & 2 deletions
@@ -1,9 +1,9 @@
 [submodule "backends/arm/third-party/ethos-u-core-driver"]
     path = backends/arm/third-party/ethos-u-core-driver
-    url = https://review.mlplatform.org/ml/ethos-u/ethos-u-core-driver
+    url = https://git.mlplatform.org/ml/ethos-u/ethos-u-core-driver.git/
 [submodule "backends/arm/third-party/serialization_lib"]
     path = backends/arm/third-party/serialization_lib
-    url = https://review.mlplatform.org/tosa/serialization_lib
+    url = https://git.mlplatform.org/tosa/serialization_lib.git/
 [submodule "backends/vulkan/third-party/Vulkan-Headers"]
     path = backends/vulkan/third-party/Vulkan-Headers
     url = https://github.com/KhronosGroup/Vulkan-Headers

backends/cadence/aot/TARGETS

Lines changed: 17 additions & 0 deletions
@@ -38,6 +38,7 @@ python_library(
     deps = [
         ":passes",
         ":utils",
+        ":ops_registrations",
         "//caffe2:torch",
         "//executorch/backends/cadence/aot/quantizer:fusion_pass",
         "//executorch/backends/cadence/aot/quantizer:quantizer",
@@ -71,6 +72,8 @@ python_library(
     ],
     deps = [
         ":utils",
+        ":fuse_ops",
+        ":simplify_ops",
         "//caffe2:torch",
         "//executorch/exir:pass_base",
         "//executorch/exir/dialects:lib",
@@ -163,6 +166,20 @@ python_library(
     ],
 )
 
+python_library(
+    name = "simplify_ops",
+    srcs = [
+        "simplify_ops.py",
+    ],
+    typing = True,
+    deps = [
+        ":pass_utils",
+        "//executorch/backends/cadence/aot:pass_utils",
+        "//executorch/exir:pass_base",
+        "//executorch/exir/dialects:lib",
+    ],
+)
+
 python_unittest(
     name = "test_graph_builder",
     srcs = [

backends/cadence/aot/compiler.py

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@
 from pathlib import Path
 from typing import Callable, cast, Optional
 
+import executorch.backends.cadence.aot.ops_registrations  # noqa
 import torch
 
 from executorch.backends.cadence.aot.passes import ReplaceSafeSoftmaxWithSoftmax

backends/cadence/aot/fuse_ops.py

Lines changed: 1 addition & 1 deletion
@@ -1022,7 +1022,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
         return PassResult(graph_module, True)
 
 
-class FuseOpsInGraph:
+class CadenceFuseOpsInGraph:
     passes = [
         FuseMMWithAdd,
         FuseBatchNormWithConv,

backends/cadence/aot/passes.py

Lines changed: 15 additions & 0 deletions
@@ -11,11 +11,13 @@
 import torch
 import torch.fx
 import torch.utils._pytree as pytree
+from executorch.backends.cadence.aot.fuse_ops import CadenceFuseOpsInGraph
 from executorch.backends.cadence.aot.pass_utils import (
     CadencePassAttribute,
     create_cadence_pass_filter,
     register_cadence_pass,
 )
+from executorch.backends.cadence.aot.simplify_ops import CadenceSimplifyOpsInGraph
 from executorch.backends.cadence.aot.utils import get_edge_overload_packet
 from executorch.backends.transforms.remove_clone_ops import RemoveCloneOpsTransform
 from executorch.exir.dialects._ops import ops as exir_ops
@@ -346,10 +348,23 @@ def get_passes_in_default_order() -> List[Type[PassType]]:
         ReplaceScalarTensorWithFullPass,
         RemoveCloneOpsTransformImported,
         RemoveNopExpandOpPass,
+        CadenceFuseOpsInGraph.passes,
         ReplaceSqueezeAndUnsqueezeWithViewPass,
         ReplacePT2QuantWithCadenceQuantPass,
         ReplacePT2DequantWithCadenceDequantPass,
+        CadenceSimplifyOpsInGraph.passes,
         # TODO: add the rest of the passes here.
+        # InitializePipeline,
+        # RemoveRedundantOps.passes,
+        # ReorderOpsInGraph.passes,
+        # RemoveJarvisNops.passes,
+        # CadenceFuseOpsInGraph.passes,
+        # ReplaceOpsInGraph.passes,
+        # SimplifyOpsInGraph.passes,
+        # FinalizePipeline,
+        # FuseFullThenReshapePass,
+        # FuseTransposeOpPairsPass,
+        # RemoveNopSliceOrViewOpPass,
     ]
     return pytree.tree_flatten(passes)[0]

backends/cadence/aot/simplify_ops.py

Lines changed: 112 additions & 0 deletions

@@ -0,0 +1,112 @@
+# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.
+
+# pyre-unsafe
+
+
+# This file contains all the functions that simplify args of an op
+
+import sys
+from typing import Optional
+
+from executorch.backends.cadence.aot.pass_utils import (
+    CadencePassAttribute,
+    register_cadence_pass,
+)
+
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, ProxyValue
+
+
+@register_cadence_pass(CadencePassAttribute(opt_level=0))
+class SimplifySliceOpPass(ExportPass):
+    """
+    Simplify the start and end indices of slice and slice_scatter ops.
+    """
+
+    def adjust_slice_range(
+        self,
+        length: int,
+        start: Optional[int] = None,
+        end: Optional[int] = None,
+        step: int = 1,
+    ) -> tuple[int, int]:
+        # Get the start index and end index
+        start_val = start if start is not None else 0
+        end_val = end if end is not None else sys.maxsize  # 2^63 - 1
+
+        # If start_val and end_val are negative, add length to them
+        if start_val < 0:
+            start_val += length
+        if end_val < 0:
+            end_val += length
+
+        # If the start val is still outside the tensor_size along the sliced
+        # dimension, adjust it accordingly.
+        if start_val < 0:
+            start_val = 0
+        elif start_val >= length:
+            start_val = length
+
+        # If the end val is still outside the tensor_size along the sliced
+        # dimension, adjust it accordingly.
+        if end_val < start_val:
+            end_val = start_val
+        elif end_val >= length:
+            end_val = length
+
+        # Return the adjusted start and end indices
+        return (start_val, end_val)
+
+    def call_operator(self, op, args, kwargs, meta):
+        # We are only interested in slice_copy or slice_scatter ops
+        if op not in {
+            exir_ops.edge.aten.slice_copy.Tensor,
+            exir_ops.edge.aten.slice_scatter.default,
+        }:
+            return super().call_operator(op, args, kwargs, meta)
+
+        # Check if it is a slice_scatter op or not. The slice_scatter op has
+        # an extra src argument at index 1.
+        slice_scatter = op == exir_ops.edge.aten.slice_scatter.default
+        # Parse the arguments
+        # Extract the tensor to be sliced, and the slicing dimension
+        in_tensor = args[0].to_tensor() if isinstance(args[0], ProxyValue) else args[0]
+        dim = args[1 + slice_scatter] if len(args) > 1 + slice_scatter else 0
+        # Make dim non-negative
+        dim = dim if dim >= 0 else dim + in_tensor.dim()
+        length = in_tensor.size(dim)
+
+        # Get the adjusted start and end indices
+        start_val = args[2 + slice_scatter] if len(args) > 2 + slice_scatter else None
+        end_val = args[3 + slice_scatter] if len(args) > 3 + slice_scatter else None
+        step = args[4 + slice_scatter] if len(args) > 4 + slice_scatter else 1
+        (start_val, end_val) = self.adjust_slice_range(length, start_val, end_val, step)
+
+        # If the start_val is geq end_val, then we can return an empty tensor
+        # for slice op, or input for slice_scatter op.
+        if start_val >= end_val and slice_scatter:
+            return args[0]
+        if start_val >= end_val:
+            empty_shape = [x for x in in_tensor.shape if x != 0]
+            empty_shape[dim] = 0
+            return super().call_operator(
+                exir_ops.edge.aten.full.default,
+                (tuple(empty_shape), 0),
+                {"dtype": in_tensor.dtype},
+                meta,
+            )
+
+        # Create new args
+        new_args = (
+            (args[0],)
+            + ((args[1],) if slice_scatter else ())
+            + (dim, start_val, end_val, step)
+        )
+        return super().call_operator(op, new_args, kwargs, meta)
+
+
+# This class encapsulates all the functions that simplify the op's args
+class CadenceSimplifyOpsInGraph:
+    passes = [
+        SimplifySliceOpPass,
+    ]

backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 81 additions & 0 deletions
@@ -14,13 +14,94 @@
     QuantizationConfig,
 )
 from executorch.exir.dialects._ops import ops as exir_ops
+from torch.ao.quantization.observer import MinMaxObserver
 from torch.ao.quantization.quantizer import (
     QuantizationAnnotation,
     SharedQuantizationSpec,
 )
 from torch.fx import Node
 
 
+def annotate_matmul_16a8w(gm: torch.fx.GraphModule) -> None:
+    """
+    This function is specific for matmul op 16a8w.
+    """
+
+    def annotate_matmul(node: Node, quantization_config: QuantizationConfig):
+        input_qspec_map = {}
+        input_act = node.args[0]
+        input_spec = quantization_config.input_activation
+        input_qspec_map[input_act] = input_spec
+
+        input_act1 = node.args[1]
+        input_spec1 = quantization_config.weight
+        input_qspec_map[input_act1] = input_spec1
+
+        node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=quantization_config.output_activation,
+            _annotated=True,
+        )
+
+    def annotate_cat(node: Node, quantization_config: QuantizationConfig):
+        input_nodes = node.args[0]
+
+        first_input_node = input_nodes[0]
+        input_qspec_map = {}
+        input_qspec_map[first_input_node] = quantization_config.input_activation
+        share_qparams_with_input_act0_qspec = SharedQuantizationSpec(
+            (first_input_node, node)
+        )
+
+        for input_node in input_nodes[1:]:
+            if input_node not in input_qspec_map:
+                input_qspec_map[input_node] = share_qparams_with_input_act0_qspec
+
+        node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=share_qparams_with_input_act0_qspec,
+            _annotated=True,
+        )
+
+    def annotate_single_in_single_out(
+        node: Node, quantization_config: QuantizationConfig
+    ) -> None:
+
+        input_qspec_map = {}
+        input_act = node.args[0]
+        input_qspec_map[input_act] = quantization_config.input_activation
+
+        node.meta[QUANT_ANNOTATION_KEY] = QuantizationAnnotation(
+            input_qspec_map=input_qspec_map,
+            output_qspec=quantization_config.output_activation,
+            _annotated=True,
+        )
+
+    def annotate_matmul_input1(node: Node):
+        quantization_config_8a8w = get_default_8bit_qnn_ptq_config(
+            act_symmetric=True, act_observer=MinMaxObserver
+        )
+        while isinstance(node, Node) and node.op == "call_function":
+            if node.target in [
+                torch.ops.aten.permute.default,
+                torch.ops.aten.transpose.int,
+            ]:
+                annotate_single_in_single_out(node, quantization_config_8a8w)
+                node = node.args[0]
+            elif node.target == torch.ops.aten.cat.default:
+                annotate_cat(node, quantization_config_8a8w)
+                node = node.args[0][0]
+            else:
+                node = node.args[0]
+
+    quantization_config_16a8w = get_16a8w_qnn_ptq_config(act_observer=MinMaxObserver)
+
+    for node in gm.graph.nodes:
+        if node.op == "call_function" and node.target == torch.ops.aten.matmul.default:
+            annotate_matmul(node, quantization_config_16a8w)
+            annotate_matmul_input1(node.args[1])
+
+
 def custom_annotate_llama_matmul_16a8w(gm: torch.fx.GraphModule) -> None:  # noqa: C901
     """
     This function is specific for llama matmul op 16a8w.

devtools/bundled_program/schema/scalar_type.fbs

Lines changed: 7 additions & 1 deletion
@@ -18,13 +18,19 @@ enum ScalarType : byte {
   FLOAT = 6,
   DOUBLE = 7,
   BOOL = 11,
-  // TODO(jakeszwe): Verify these are unused and then remove support
   QINT8 = 12,
   QUINT8 = 13,
   QINT32 = 14,
   QUINT4X2 = 16,
   QUINT2X4 = 17,
   BITS16 = 22,
+  FLOAT8E5M2 = 23,
+  FLOAT8E4M3FN = 24,
+  FLOAT8E5M2FNUZ = 25,
+  FLOAT8E4M3FNUZ = 26,
+  UINT16 = 27,
+  UINT32 = 28,
+  UINT64 = 29,
   // Types currently not implemented.
   // COMPLEXHALF = 8,
   // COMPLEXFLOAT = 9,
