pytorch
diff --git a/‎backends/aoti/targets.bzl‎
Lines changed: 3 additions & 3 deletions b/‎backends/aoti/targets.bzl‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎backends/apple/coreml/TARGETS‎
Lines changed: 7 additions & 24 deletions b/‎backends/apple/coreml/TARGETS‎
Lines changed: 7 additions & 24 deletions
diff --git a/‎backends/apple/mps/TARGETS‎
Lines changed: 2 additions & 6 deletions b/‎backends/apple/mps/TARGETS‎
Lines changed: 2 additions & 6 deletions
diff --git a/‎backends/apple/mps/targets.bzl‎
Lines changed: 1 addition & 10 deletions b/‎backends/apple/mps/targets.bzl‎
Lines changed: 1 addition & 10 deletions
diff --git a/‎backends/arm/_passes/decompose_layernorm_pass.py‎
Lines changed: 5 additions & 0 deletions b/‎backends/arm/_passes/decompose_layernorm_pass.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎backends/arm/runtime/targets.bzl‎
Lines changed: 2 additions & 2 deletions b/‎backends/arm/runtime/targets.bzl‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎backends/arm/test/targets.bzl‎
Lines changed: 1 addition & 1 deletion b/‎backends/arm/test/targets.bzl‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/cadence/aot/reorder_ops.py‎
Lines changed: 23 additions & 11 deletions b/‎backends/cadence/aot/reorder_ops.py‎
Lines changed: 23 additions & 11 deletions
diff --git a/‎backends/cadence/aot/tests/test_reorder_ops_passes.py‎
Lines changed: 28 additions & 14 deletions b/‎backends/cadence/aot/tests/test_reorder_ops_passes.py‎
Lines changed: 28 additions & 14 deletions
diff --git a/‎backends/cadence/fusion_g3/operators/targets.bzl‎
Lines changed: 1 addition & 4 deletions b/‎backends/cadence/fusion_g3/operators/targets.bzl‎
Lines changed: 1 addition & 4 deletions
@@ -49,7 +49,7 @@ def define_common_targets():
         supports_python_dlopen = True,
         # Constructor needed for backend registration.
         compiler_flags = ["-Wno-global-constructors"],
-        visibility = ["@EXECUTORCH_CLIENTS"],
+        visibility = ["PUBLIC"],
         deps = [
             "//executorch/runtime/core:core",
             "//executorch/runtime/core/exec_aten:lib",
@@ -67,7 +67,7 @@ def define_common_targets():
         supports_python_dlopen = True,
         # Constructor needed for backend registration.
         compiler_flags = ["-Wno-global-constructors"],
-        visibility = ["@EXECUTORCH_CLIENTS"],
+        visibility = ["PUBLIC"],
         deps = [
             "//executorch/runtime/backend:interface",
             "//executorch/runtime/core:core",
@@ -80,7 +80,7 @@ def define_common_targets():
         # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
         link_whole = True,
         supports_python_dlopen = True,
-        visibility = ["@EXECUTORCH_CLIENTS"],
+        visibility = ["PUBLIC"],
         exported_deps = [
             ":common_shims",
             ":delegate_handle",
 
@@ -8,9 +8,7 @@ oncall("executorch")
 # TODO: this is a placeholder to support internal fbcode build. We should add the coreml backend target properly.
 runtime.python_library(
     name = "coreml",
-    visibility = [
-        "@EXECUTORCH_CLIENTS",
-    ],
+    visibility = ["PUBLIC"],
 )
 
 runtime.python_library(
@@ -19,9 +17,7 @@ runtime.python_library(
         "compiler/*.py",
         "logging.py",
     ]),
-    visibility = [
-        "@EXECUTORCH_CLIENTS",
-    ],
+    visibility = ["PUBLIC"],
     deps = [
         "fbsource//third-party/pypi/coremltools:coremltools",
         ":executorchcoreml",
@@ -36,9 +32,7 @@ runtime.python_library(
         "partition/*.py",
         "logging.py",
     ]),
-    visibility = [
-        "@EXECUTORCH_CLIENTS",
-    ],
+    visibility = ["PUBLIC"],
     deps = [
         "fbsource//third-party/pypi/coremltools:coremltools",
         ":backend",
@@ -55,9 +49,7 @@ runtime.python_library(
     srcs = glob([
         "quantizer/*.py",
     ]),
-    visibility = [
-        "@EXECUTORCH_CLIENTS",
-    ],
+    visibility = ["PUBLIC"],
 )
 
 runtime.python_library(
@@ -66,10 +58,7 @@ runtime.python_library(
         "recipes/__init__.py",
         "recipes/coreml_recipe_provider.py"
     ],
-    visibility = [
-        "@EXECUTORCH_CLIENTS",
-        "//executorch/export/...",
-    ],
+    visibility = ["PUBLIC"],
     deps = [
         "fbsource//third-party/pypi/coremltools:coremltools",
         ":coreml_recipe_types",
@@ -91,10 +80,7 @@ runtime.python_library(
     srcs = [
         "recipes/coreml_recipe_types.py",
     ],
-    visibility = [
-        "@EXECUTORCH_CLIENTS",
-        "//executorch/export/...",
-    ],
+    visibility = ["PUBLIC"],
     deps = [
         "//executorch/export:recipe",
     ],
@@ -124,10 +110,7 @@ runtime.cxx_python_extension(
     types = [
         "executorchcoreml.pyi",
     ],
-    visibility = [
-        "//executorch/examples/apple/coreml/...",
-        "@EXECUTORCH_CLIENTS",
-    ],
+    visibility = ["PUBLIC"],
     deps = [
         "fbsource//third-party/nlohmann-json:nlohmann-json",
         "fbsource//third-party/pybind11:pybind11",
 
@@ -19,9 +19,7 @@ runtime.python_library(
         "__init__.py",
         "mps_preprocess.py",
     ],
-    visibility = [
-        "@EXECUTORCH_CLIENTS",
-    ],
+    visibility = ["PUBLIC"],
     deps = [
         ":operators",
         ":serialization",
@@ -49,9 +47,7 @@ runtime.python_library(
     srcs = glob([
         "partition/*.py",
     ]),
-    visibility = [
-        "@EXECUTORCH_CLIENTS",
-    ],
+    visibility = ["PUBLIC"],
     deps = [
         ":backend",
         "//caffe2:torch",
 
@@ -39,16 +39,7 @@ def define_common_targets(is_xplat = False, platforms = []):
             "runtime/operations/*.h",
         ]),
         "srcs": MPS_BACKEND_BUCK_SRCS,
-        "visibility": [
-            "//executorch/backends/apple/...",
-            "//executorch/examples/...",
-            "//executorch/exir/backend:backend_lib",
-            "//executorch/extension/pybindings/...",
-            "//executorch/runtime/backend/...",
-            "//executorch/devtools/runners/...",
-            "//executorch/test/...",
-            "@EXECUTORCH_CLIENTS",
-        ],
+        "visibility": ["PUBLIC"],
         "link_whole": True,
     }
 
 
@@ -90,6 +90,11 @@ def call(self, graph_module: torch.fx.GraphModule):
             args = node.args
             meta = node.meta
             match len(args):
+                case 6:
+                    # torch.ops.aten.layer_norm.default has 6 args:
+                    # (input, normalized_shape, weight, bias, eps, cudnn_enable)
+                    # cudnn_enable is not used in the decomposition
+                    x, normalized_shape, weights, bias, epsilon, _cudnn_enable = args
                 case 5:
                     x, normalized_shape, weights, bias, epsilon = args
                 case 4:
 
@@ -5,7 +5,7 @@ def define_common_targets():
         name = "vela_bin_stream",
         srcs = ["VelaBinStream.cpp"],
         exported_headers = ["VelaBinStream.h"],
-        visibility = ["@EXECUTORCH_CLIENTS"],
+        visibility = ["PUBLIC"],
         deps = [
             "//executorch/runtime/core:core",
         ],
@@ -21,7 +21,7 @@ def define_common_targets():
         supports_python_dlopen = True,
         # Constructor needed for backend registration.
         compiler_flags = ["-Wno-global-constructors"],
-        visibility = ["@EXECUTORCH_CLIENTS"],
+        visibility = ["PUBLIC"],
         deps = [
             "//executorch/runtime/backend:interface",
             ":vela_bin_stream",
 
@@ -19,7 +19,7 @@ def define_arm_tests():
         "ops/test_avg_pool2d.py",
         "ops/test_cat.py",
         "ops/test_conv2d.py",
-        "ops/test_linear.py", 
+        "ops/test_linear.py",
         "ops/test_mul.py",
         "ops/test_permute.py",
         "ops/test_rsqrt.py",
 
@@ -299,8 +299,9 @@ def advancing_feasible(self, quant_node: torch.fx.Node):
         # All the conditions satisfied, we advance.
         return True
 
-    def advance_quantize_op(self, graph_module: torch.fx.GraphModule):
+    def advance_quantize_op(self, graph_module: torch.fx.GraphModule) -> bool:
         graph = graph_module.graph
+        modified = False
         for node in reversed(graph.nodes):
             if get_overload_packet(node.target) not in (
                 exir_ops.edge.quantized_decomposed.quantize_per_tensor,
@@ -339,15 +340,19 @@ def advance_quantize_op(self, graph_module: torch.fx.GraphModule):
             # We can safely remove the quant node and trivially quantizable op
             graph.erase_node(node)
             graph.erase_node(trivially_quantizable_op)
+            modified = True
 
-        graph_module.recompile()
-        graph_module.graph.eliminate_dead_code()
+        return modified
 
     def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
         self.graph_module = graph_module
-        self.advance_quantize_op(graph_module)
-        result = super().call(graph_module)
-        return result
+        modified = self.advance_quantize_op(graph_module)
+        if modified:
+            graph_module.recompile()
+            graph_module.graph.eliminate_dead_code()
+            return super().call(graph_module)
+
+        return PassResult(graph_module, False)
 
 
 @register_cadence_pass(CadencePassAttribute(opt_level=1))
@@ -474,14 +479,21 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
         # the graph (up to 3 times max, to avoid potential infinite loops)
         self.graph_module = graph_module
         iter_count = 0
-        modified = True
+        local_modified = False
+        overall_modified = False
+
+        while local_modified or iter_count == 0:
+            local_modified = self.postpone_dequantize_op(self.graph_module)
+            overall_modified |= local_modified
+
+            if local_modified:
+                self.graph_module = super().call(self.graph_module).graph_module
 
-        while modified and iter_count < 3:
-            modified = self.postpone_dequantize_op(self.graph_module)
-            self.graph_module = super().call(self.graph_module).graph_module
             iter_count += 1
+            if iter_count == 3:
+                break
 
-        return super().call(self.graph_module)
+        return PassResult(self.graph_module, overall_modified)
 
 
 @register_cadence_pass(CadencePassAttribute(opt_level=1))
 
@@ -286,13 +286,14 @@ def test_advance_branched_quantize(self) -> None:
     @torch.no_grad()
     def test_advance_quantize(self) -> None:
         builder = GraphBuilder()
-        x = builder.placeholder("x", torch.randn(16, 1, 6, 32, dtype=torch.float32))
-        weights = builder.placeholder(
-            "weights", torch.randint(-128, 127, (32, 32), dtype=torch.int8)
-        )
+        x_data = torch.randn(16, 1, 32, 6, dtype=torch.float32)
+        weight_data = torch.randint(-128, 127, (32, 32), dtype=torch.int8)
+        x = builder.placeholder("x", x_data)
+        weights = builder.placeholder("weights", weight_data)
         full = builder.call_operator(
             op=exir_ops.edge.aten.full.default,
             args=([1], -7),
+            kwargs={"dtype": torch.int32},
         )
         full_1 = builder.call_operator(
             op=exir_ops.edge.aten.full.default,
@@ -304,7 +305,8 @@ def test_advance_quantize(self) -> None:
         )
         full_3 = builder.call_operator(
             op=exir_ops.edge.aten.full.default,
-            args=([12], 0.0),
+            args=([1], 0),
+            kwargs={"dtype": torch.int32},
         )
         permute = builder.call_operator(
             op=exir_ops.edge.aten.permute_copy.default,
@@ -337,8 +339,13 @@ def test_advance_quantize(self) -> None:
 
         p1 = AdvanceQuantizeOpAboveDefInBranchPass()
         tmp_graph = cast(PassResult, p1(original_graph)).graph_module
-        p2 = AdvanceQuantizeOpAboveDefChainPass()
-        converted_graph = cast(PassResult, p2(tmp_graph)).graph_module
+        result = transform_and_check_numerics(
+            tmp_graph,
+            (x_data, weight_data),
+            AdvanceQuantizeOpAboveDefChainPass(),
+        )
+        self.assertFalse(result.modified)
+        converted_graph = result.graph_module
         # Assert that permute node is now the successor of the quant node.
         self.assertTrue(
             get_node_pos(
@@ -349,13 +356,14 @@ def test_advance_quantize(self) -> None:
 
     def test_postpone_dequantize1(self) -> None:
         builder = GraphBuilder()
-        x = builder.placeholder("x", torch.randn(1, 16, 32, 6, dtype=torch.float32))
-        weights = builder.placeholder(
-            "weights", torch.randint(-128, 127, (6, 6), dtype=torch.int8)
-        )
+        x_data = torch.randn(1, 16, 32, 6, dtype=torch.float32)
+        weight_data = torch.randint(-128, 127, (6, 6), dtype=torch.int8)
+        x = builder.placeholder("x", x_data)
+        weights = builder.placeholder("weights", weight_data)
         full = builder.call_operator(
             op=exir_ops.edge.aten.full.default,
             args=([1], -7),
+            kwargs={"dtype": torch.int32},
         )
         full_1 = builder.call_operator(
             op=exir_ops.edge.aten.full.default,
@@ -367,7 +375,8 @@ def test_postpone_dequantize1(self) -> None:
         )
         full_3 = builder.call_operator(
             op=exir_ops.edge.aten.full.default,
-            args=([12], 0.0),
+            args=([1], 0),
+            kwargs={"dtype": torch.int32},
         )
         quantize_per_tensor = builder.call_operator(
             op=exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
@@ -397,8 +406,13 @@ def test_postpone_dequantize1(self) -> None:
         )
         builder.output([permute])
         original_graph = builder.get_graph_module()
-        p = PostponeDequantizeOpBelowUseChainPass()
-        converted_graph = cast(PassResult, p(original_graph)).graph_module
+        result = transform_and_check_numerics(
+            original_graph,
+            (x_data, weight_data),
+            PostponeDequantizeOpBelowUseChainPass(),
+        )
+        self.assertTrue(result.modified)
+        converted_graph = result.graph_module
         # Assert that dequant node is now the successor of the permute node.
         self.assertTrue(
             get_node_pos(converted_graph, exir_ops.edge.aten.permute_copy.default)
 
@@ -23,10 +23,7 @@ def define_operator(name: str, deps: list[str] | None = None) -> None:
         name = op_name,
         srcs = [op_name + ".cpp"],
         platforms = CXX,
-        visibility = [
-            "//executorch/backends/cadence/...",
-            "@EXECUTORCH_CLIENTS",
-        ],
+        visibility = ["PUBLIC"],
         compatible_with = ["ovr_config//cpu:xtensa"],
         deps = deps + common_deps,
     )