Commit 6d0a0be

Update base for Update on "[Executorch] mul broadcast update"

Handle broadcasting for >2D tensors in the optimized library. For now, the optimized path supports broadcasting only across dims other than the 0th and the (N-1)st.

Differential Revision: [D64156862](https://our.internmc.facebook.com/intern/diff/D64156862/)

[ghstack-poisoned]
2 parents 6a5171e + 9890c24 commit 6d0a0be
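
As a quick illustration of the case the commit message describes (my sketch, based on standard PyTorch broadcasting semantics; not code from this commit): an element-wise mul of >2D tensors where the size-1 dim sits in the middle, i.e. neither the 0th nor the (N-1)st dim.

import torch

# Hypothetical example of the newly supported case: broadcasting across a
# middle dimension (dim 1 here), while dim 0 and the last dim match exactly.
a = torch.randn(4, 8, 16)
b = torch.randn(4, 1, 16)
out = a * b  # b's dim 1 broadcasts from 1 to 8
assert out.shape == (4, 8, 16)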

27 files changed: +683 −470 lines


.github/workflows/update-viablestrict.yml

Lines changed: 1 addition & 1 deletion
@@ -20,7 +20,7 @@ jobs:
       with:
         repository: pytorch/executorch
         stable-branch: viable/strict
-        requires: '[\"pull\", \"lint\", \"trunk\", \"Build documentation\", \"^Android$\", \"^Apple$\"]'
+        requires: '[\"pull\", \"lint\", \"trunk\", \"Build documentation\", \"^Apple$\"]'
         secret-bot-token: ${{ secrets.UPDATEBOT_TOKEN }}
         clickhouse-url: ${{ secrets.CLICKHOUSE_URL }}
         clickhouse-username: ${{ secrets.CLICKHOUSE_VIABLESTRICT_USERNAME }}

backends/qualcomm/quantizer/custom_annotation.py

Lines changed: 33 additions & 0 deletions
@@ -12,6 +12,7 @@
     QuantizationConfig,
 )
 from executorch.backends.qualcomm.quantizer.utils import QUANT_ANNOTATION_KEY
+from executorch.exir.dialects._ops import ops as exir_ops
 from torch.ao.quantization.quantizer import (
     QuantizationAnnotation,
     SharedQuantizationSpec,
@@ -144,3 +145,35 @@ def annotate_matmul(node: Node, quantization_config: QuantizationConfig):
     for node in gm.graph.nodes:
         if node.op == "call_function" and node.target == torch.ops.aten.matmul.default:
             annotate_matmul(node, quantization_config_16a8w)
+
+
+def get_custom_quant_ios_dtype(
+    cache_shape: torch.Size,
+    node: torch.fx.Node,
+    kv_dtype=torch.uint8,
+    sharding_dtype=torch.uint16,
+):
+    """
+    This function is specific for llama inputs and outputs
+    """
+    if node.op == "placeholder" and "attention_sdpa_kv_cache_past_" in node.name:
+        return kv_dtype
+
+    # Tag index put node before copy node, because copy is a skipped node in qnn
+    if (
+        exir_ops.edge.aten.index_put.default == node.target
+        and node.meta["val"].shape == cache_shape
+    ):
+        return kv_dtype
+
+    # Tag sharding io
+    if exir_ops.edge.llama.fallback.default in [
+        u.target for u in list(node.users.keys())
+    ] + [node.target]:
+        return sharding_dtype
+
+    # Tag index op as quantized tensors. It is caused by sharding
+    if exir_ops.edge.aten.index.Tensor in [
+        u.target for u in list(node.users.keys())
+    ] + [node.target]:
+        return sharding_dtype

backends/qualcomm/utils/utils.py

Lines changed: 10 additions & 0 deletions
@@ -71,6 +71,7 @@
     QCOM_PASS_EXPAND_BROADCAST_SHAPE,
     QCOM_PASS_SKIP_ADVANCED_REQUANT,
     QCOM_QNN_COMPILE_SPEC,
+    QCOM_QUANTIZED_IO,
 )

 from executorch.exir import ExirExportedProgram
@@ -876,3 +877,12 @@ def get_soc_to_chipset_map():
         "SM8475": QcomChipset.SM8475,
         "SM8450": QcomChipset.SM8450,
     }
+
+
+def tag_quant_io(gm: torch.fx.GraphModule, get_quant_io_dtype_fn: Callable):
+    """
+    Tag io nodes which get/output quantized tensor. No need to insert q/dq in qnn_preprocess
+    """
+    for node in gm.graph.nodes:
+        if dtype := get_quant_io_dtype_fn(node):
+            node.meta[QCOM_QUANTIZED_IO] = dtype
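
Taken together, the two helpers added in this commit suggest the following usage; this is a sketch under my own assumptions (the wrapper name tag_llama_quant_io, the functools.partial binding, and the example cache shape are mine, not from the diff):

from functools import partial

import torch

from executorch.backends.qualcomm.quantizer.custom_annotation import (
    get_custom_quant_ios_dtype,
)
from executorch.backends.qualcomm.utils.utils import tag_quant_io


def tag_llama_quant_io(gm: torch.fx.GraphModule, cache_shape: torch.Size) -> None:
    # Bind cache_shape so the callback matches tag_quant_io's expected
    # signature of node -> Optional[torch.dtype].
    tag_quant_io(gm, partial(get_custom_quant_ios_dtype, cache_shape))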

backends/transforms/fuse_conv_with_clamp.py

Lines changed: 2 additions & 3 deletions
@@ -6,10 +6,9 @@

 import sys

+import executorch.backends.vulkan.custom_ops_lib # noqa
+
 import torch
-from executorch.backends.vulkan._passes.custom_ops_defs import ( # noqa
-    conv_with_clamp_op,
-)

 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult

backends/transforms/targets.bzl

Lines changed: 1 addition & 1 deletion
@@ -70,7 +70,7 @@ def define_common_targets():
         deps = [
             ":utils",
             "//caffe2:torch",
-            "//executorch/backends/vulkan/_passes:custom_ops_defs",
+            "//executorch/backends/vulkan:custom_ops_lib",
             "//executorch/exir:pass_base",
             "//executorch/exir:sym_util",
             "//executorch/exir/dialects:lib",

backends/vulkan/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
@@ -83,14 +83,14 @@ set(vulkan_standard_shaders_cpp ${generated_spv_cpp})
 set(SCHEMA_INCLUDE_DIR ${CMAKE_BINARY_DIR}/schema/include)

 set(GENERATED_HEADER
-    ${SCHEMA_INCLUDE_DIR}/executorch/backends/vulkan/schema_generated.h
+    ${SCHEMA_INCLUDE_DIR}/executorch/backends/vulkan/serialization/schema_generated.h
 )

 add_custom_command(
   OUTPUT ${GENERATED_HEADER}
   COMMAND
     ${FLATC_EXECUTABLE} --cpp --cpp-std c++11 --scoped-enums -o
-    "${SCHEMA_INCLUDE_DIR}/executorch/backends/vulkan/" ${_vulkan_schema__srcs}
+    "${SCHEMA_INCLUDE_DIR}/executorch/backends/vulkan/serialization/" ${_vulkan_schema__srcs}
   WORKING_DIRECTORY ${EXECUTORCH_ROOT}
   COMMENT "Generating vulkan_schema headers"
   VERBATIM

backends/vulkan/TARGETS

Lines changed: 0 additions & 33 deletions
@@ -1,37 +1,4 @@
-load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
 load(":targets.bzl", "define_common_targets")
-
 oncall("executorch")

 define_common_targets(is_fbcode = True)
-
-runtime.python_library(
-    name = "vulkan_preprocess",
-    srcs = [
-        "serialization/vulkan_graph_builder.py",
-        "serialization/vulkan_graph_schema.py",
-        "serialization/vulkan_graph_serialize.py",
-        "vulkan_preprocess.py",
-    ],
-    resources = [
-        "serialization/schema.fbs",
-    ],
-    visibility = [
-        "//executorch/...",
-        "//executorch/vulkan/...",
-        "@EXECUTORCH_CLIENTS",
-    ],
-    deps = [
-        "//executorch/backends/transforms:addmm_mm_to_linear",
-        "//executorch/backends/transforms:fuse_batch_norm_with_conv",
-        "//executorch/backends/transforms:fuse_conv_with_clamp",
-        "//executorch/backends/transforms:fuse_dequant_linear",
-        "//executorch/backends/transforms:fuse_view_copy",
-        "//executorch/backends/transforms:remove_clone_ops",
-        "//executorch/backends/vulkan/_passes:vulkan_passes",
-        "//executorch/exir:graph_module",
-        "//executorch/exir/_serialize:_bindings",
-        "//executorch/exir/_serialize:lib",
-        "//executorch/exir/backend:backend_details",
-    ],
-)

backends/vulkan/_passes/TARGETS

Lines changed: 1 addition & 26 deletions
@@ -3,31 +3,6 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")

 oncall("executorch")

-runtime.python_library(
-    name = "custom_ops_defs",
-    srcs = [
-        "custom_ops_defs.py",
-    ],
-    visibility = [
-        "//executorch/...",
-        "@EXECUTORCH_CLIENTS",
-    ],
-    deps = [
-        "//caffe2:torch",
-    ],
-)
-
-python_unittest(
-    name = "test_custom_ops",
-    srcs = [
-        "test_custom_ops.py",
-    ],
-    deps = [
-        ":custom_ops_defs",
-        "//caffe2:torch",
-    ],
-)
-
 runtime.python_library(
     name = "insert_prepack_nodes",
     srcs = ["insert_prepack_nodes.py"],
@@ -62,7 +37,7 @@ runtime.python_library(
         "//executorch/backends/...",
     ],
     deps = [
-        ":custom_ops_defs",
+        "//executorch/backends/vulkan:custom_ops_lib",
         "//pytorch/ao:torchao",
     ]
 )

backends/vulkan/_passes/insert_prepack_nodes.py

Lines changed: 14 additions & 27 deletions
@@ -6,39 +6,27 @@

 # pyre-strict

-from typing import List
-
-import executorch.backends.vulkan._passes.custom_ops_defs # noqa
+import executorch.backends.vulkan.custom_ops_lib # noqa

 import torch

+from executorch.backends.vulkan.op_registry import handles_own_prepacking
+
 from executorch.exir.dialects._ops import ops as exir_ops

 from torch._export.utils import is_buffer, is_param
 from torch.export import ExportedProgram

-USES_WEIGHTS: List[torch._ops.OpOverload] = [
-    exir_ops.edge.aten.embedding.default,
-    exir_ops.edge.aten.convolution.default,
-    exir_ops.edge.et_vk.conv_with_clamp.default,
-    exir_ops.edge.aten.linear.default,
-    exir_ops.edge.aten._weight_int8pack_mm.default,
-    exir_ops.edge.et_vk.linear_weight_int4.default,
-    exir_ops.edge.aten._native_batch_norm_legit_no_training.default,
-    exir_ops.edge.aten.native_layer_norm.default,
-    "llama::sdpa_with_kv_cache",
-]
-

 def insert_prepack_nodes(program: ExportedProgram) -> ExportedProgram:
     """
     Insert `et_vk.prepack` nodes for constant tensors in the graph. The prepack operator
     is responsible for transferring the tensor data, which is serialized with the model,
     to a GPU tensor object during the prepacking stage of model execution.

-    Some operators, listed in `USES_WEIGHTS` above, are performance sensitive and will
-    prefer to handle prepacking within the operator. For these ops, the constant tensor
-    data will be passed directly as an argument into the operator implementation.
+    Some operators are performance sensitive and will prefer to handle prepacking within
+    the operator. For these ops, the constant tensor data will be passed directly as an
+    argument into the operator implementation.
     """

     def is_get_attr_node(node: torch.fx.Node) -> bool:
@@ -58,22 +46,21 @@ def is_param_node(node: torch.fx.Node) -> bool:
             or is_constant(node)
         )

-    def is_non_weight_param_tensor(node: torch.fx.Node) -> bool:
+    def prepack_not_required(node: torch.fx.Node) -> bool:
         if not is_param_node(node):
-            return False
+            return True

         for user in node.users:
-            if user.op == "call_function" and (
-                # pyre-ignore [16]
-                user.target in USES_WEIGHTS
-                or user.target.name() in USES_WEIGHTS
+            if user.op == "call_function" and handles_own_prepacking(
+                # pyre-ignore
+                user.target
             ):
-                return False
+                return True

-        return True
+        return False

     for node in program.graph_module.graph.nodes:
-        if not is_non_weight_param_tensor(node):
+        if prepack_not_required(node):
             continue

         with program.graph_module.graph.inserting_after(node):
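
The refactor above swaps the hardcoded USES_WEIGHTS list for a handles_own_prepacking predicate imported from executorch.backends.vulkan.op_registry. The registry's internals are not part of this diff; purely as an assumed sketch, such a predicate could be backed by a per-op feature table along these lines (all names below except handles_own_prepacking are hypothetical):

from typing import Any, Dict

# Hypothetical feature table keyed by op overload (or op name string, to
# cover entries like "llama::sdpa_with_kv_cache" from the removed list).
_OP_FEATURES: Dict[Any, bool] = {}


def register_op(op: Any, handles_prepacking: bool = False) -> None:
    # Record whether the op's implementation prepacks its own weights.
    _OP_FEATURES[op] = handles_prepacking


def handles_own_prepacking(op: Any) -> bool:
    # Ops not registered fall back to graph-level et_vk.prepack nodes.
    return _OP_FEATURES.get(op, False)

The upside of a registry over the old list is that each op declares its own prepacking behavior at registration time, so passes like insert_prepack_nodes no longer need a parallel list to keep in sync.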

backends/vulkan/_passes/int4_weight_only_quantizer.py

Lines changed: 2 additions & 4 deletions
@@ -1,13 +1,11 @@
 import logging
 from typing import Any, Callable, Dict, Optional, Type

+import executorch.backends.vulkan.custom_ops_lib # noqa
+
 import torch
 import torch.nn.functional as F

-from executorch.backends.vulkan._passes.custom_ops_defs import ( # noqa
-    linear_weight_int4_op,
-)
-
 from torchao.quantization.GPTQ import _check_linear_int4_k
 from torchao.quantization.unified import Quantizer
 from torchao.quantization.utils import groupwise_affine_quantize_tensor
