# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-strict

import executorch.backends.vulkan.utils as utils
import torch

from executorch.exir.dialects._ops import ops as exir_ops
from executorch.exir.pass_base import ExportPass, PassResult

#############################
## aten.weight_int8pack_mm ##
#############################


def matches_int8pack_mm_pattern(node: torch.fx.Node) -> bool:
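    """
    Check whether `node` is a linear node whose weight argument is produced by a
    per channel dequantize of an int8 tensor and whose activation is not itself
    dequantized. This is the weight only quantized linear pattern produced by
    the PT2E quantization flow that can be fused into aten._weight_int8pack_mm.
    """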
    if not utils.is_linear_node(node):
        return False

    input_node = node.args[0]
    weight_node = node.args[1]

    # Type checking
    if not isinstance(weight_node, torch.fx.Node):
        return False
    if not isinstance(input_node, torch.fx.Node):
        return False

    # The weight arg should be a dequant node dequantizing the quantized weight
    # Furthermore, the op expects per channel quantization of the weight
    if not utils.is_dequant_per_channel_node(weight_node):
        return False

    orig_weight = weight_node.args[0]
    if not isinstance(orig_weight, torch.fx.Node):
        return False

    # The quantized weight data should be an int8 tensor
    if orig_weight.meta["val"].dtype != torch.int8:
        return False

    # The input arg should not be a dequant node
    if utils.is_dequant_node(input_node):
        return False

    return True


def fuse_into_weight_int8pack_mm_node(
    graph_module: torch.fx.GraphModule,
    linear_node: torch.fx.Node,
) -> None:
    """
    The weight_int8pack_mm operator represents a weight only quantized linear
    operator. After the PT2E quantization flow, the expected graph pattern is

        dq_weight = dequantize(weight, scales)
        out = linear(activation, dq_weight, bias?)

    The goal of this function is to condense that sequence into

        out = weight_int8pack_mm(activation, weight, scales)
        out = out + bias  # only if a bias was present
    """
    activation = linear_node.args[0]
    dq_weight_node = linear_node.args[1]
    assert isinstance(activation, torch.fx.Node)
    assert isinstance(dq_weight_node, torch.fx.Node)

    bias = None
    if len(linear_node.args) > 2:
        bias = linear_node.args[2]
        assert isinstance(bias, torch.fx.Node)

    orig_weight = dq_weight_node.args[0]
    scale = dq_weight_node.args[1]

    with graph_module.graph.inserting_before(linear_node):
        weight_int8pack_mm_node = graph_module.graph.create_node(
            "call_function",
            exir_ops.edge.aten._weight_int8pack_mm.default,
            (activation, orig_weight, scale),
        )
        if bias is not None:
            add_node = graph_module.graph.create_node(
                "call_function",
                exir_ops.edge.aten.add.Tensor,
                (weight_int8pack_mm_node, bias),
            )
            linear_node.replace_all_uses_with(add_node)
        else:
            linear_node.replace_all_uses_with(weight_int8pack_mm_node)
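        # Erase the linear node before the dequant node so that the dequant node
        # has no remaining users by the time it is removed from the graph.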
        graph_module.graph.erase_node(linear_node)
        graph_module.graph.erase_node(dq_weight_node)


class FuseQuantizedOpsTransform(ExportPass):
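    """
    Fuse quantized op patterns (currently the weight only quantized linear
    pattern) into their corresponding fused operators.
    """
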
    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
        for node in graph_module.graph.nodes:
            if matches_int8pack_mm_pattern(node):
                fuse_into_weight_int8pack_mm_node(graph_module, node)

        graph_module.recompile()
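        # Re-run the parent ExportPass over the mutated graph to retrace it and
        # regenerate node metadata.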
        graph_module = super().call(graph_module).graph_module

        return PassResult(graph_module, True)
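

# Example usage (a minimal sketch; assumes the standard ExportPass calling
# convention, where invoking the pass instance on a GraphModule returns a
# PassResult, and `ep` is a hypothetical torch.export.ExportedProgram):
#
#     result = FuseQuantizedOpsTransform()(ep.graph_module)
#     fused_graph_module = result.graph_module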