pytorch
diff --git a/‎.ci/docker/ci_commit_pins/pytorch.txt‎
Lines changed: 1 addition & 1 deletion b/‎.ci/docker/ci_commit_pins/pytorch.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/aoti/common_shims.h‎
Lines changed: 1 addition & 0 deletions b/‎backends/aoti/common_shims.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/aoti/slim/c10/core/Device.h‎
Lines changed: 5 additions & 7 deletions b/‎backends/aoti/slim/c10/core/Device.h‎
Lines changed: 5 additions & 7 deletions
diff --git a/‎backends/aoti/slim/core/Storage.h‎
Lines changed: 1 addition & 1 deletion b/‎backends/aoti/slim/core/Storage.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/aoti/slim/core/test/test_storage_cpu.cpp‎
Lines changed: 3 additions & 6 deletions b/‎backends/aoti/slim/core/test/test_storage_cpu.cpp‎
Lines changed: 3 additions & 6 deletions
diff --git a/‎backends/arm/MODELS.md‎
Lines changed: 20 additions & 0 deletions b/‎backends/arm/MODELS.md‎
Lines changed: 20 additions & 0 deletions
diff --git a/‎backends/arm/_passes/__init__.py‎
Lines changed: 3 additions & 0 deletions b/‎backends/arm/_passes/__init__.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎backends/arm/_passes/arm_pass.py‎
Lines changed: 17 additions & 2 deletions b/‎backends/arm/_passes/arm_pass.py‎
Lines changed: 17 additions & 2 deletions
diff --git a/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 39 additions & 37 deletions b/‎backends/arm/_passes/arm_pass_manager.py‎
Lines changed: 39 additions & 37 deletions
diff --git a/‎backends/arm/_passes/cast_int64_pass.py‎
Lines changed: 2 additions & 2 deletions b/‎backends/arm/_passes/cast_int64_pass.py‎
Lines changed: 2 additions & 2 deletions
@@ -1 +1 @@
-7a064ed3eafa43f17412d434b395240c727b3000
+7a79b41e29a790ebb4b530eb98a89381e2d7de29
@@ -64,6 +64,7 @@ AOTI_SHIM_EXPORT int32_t aoti_torch_device_type_cpu();
 AOTI_SHIM_EXPORT int32_t aoti_torch_layout_strided();
 AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_float32();
 AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bfloat16();
+AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_bool();
 AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int8();
 AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int16();
 AOTI_SHIM_EXPORT int32_t aoti_torch_dtype_int32();
 
@@ -26,27 +26,25 @@ using DeviceIndex = int8_t;
 /// Represents a compute device on which a tensor is located.
 /// A device is uniquely identified by a type (e.g., CPU) and a device index.
 struct Device final {
-  using Type = DeviceType;
-
   /// Constructs a new Device from a DeviceType and an optional device index.
   /// @param type The type of device.
   /// @param index The device index. For CPU, this should be -1 or 0.
   /* explicit: a bare DeviceType does not implicitly convert to Device */
-  Device(DeviceType type, DeviceIndex index = -1) : type_(type), index_(index) {
+  explicit Device(DeviceType type, DeviceIndex index = -1)
+      : type_(type), index_(index) {
     validate();
   }
 
   /// Constructs a Device from a string description.
   /// The string must be "cpu" or "cpu:0".
-  /* implicit */ Device(const std::string& device_string) : Device(Type::CPU) {
+  /* implicit */ Device(const std::string& device_string)
+      : Device(DeviceType::CPU) {
     ET_CHECK_MSG(!device_string.empty(), "Device string must not be empty");
 
     if (device_string == "cpu" || device_string == "CPU") {
       type_ = DeviceType::CPU;
       index_ = -1;
-    } else if (
-        device_string == "cpu:0" || device_string == "CPU:0" ||
-        device_string == "cpu:1" || device_string == "CPU:1") {
+    } else if (device_string == "cpu:0" || device_string == "CPU:0") {
       type_ = DeviceType::CPU;
       index_ = static_cast<DeviceIndex>(device_string.back() - '0');
     } else {
 
@@ -27,7 +27,7 @@ inline void noop(void*) {}
 } // namespace detail
 
 /// Default CPU device constant.
-const c10::Device CPU_DEVICE = c10::Device(c10::DeviceType::CPU, 0);
+inline const c10::Device CPU_DEVICE = c10::Device(c10::DeviceType::CPU, 0);
 
 /// DeviceTraits template for device-specific operations.
 /// Device-specific implementations provide allocate(), free(), and memcpy().
 
@@ -190,7 +190,7 @@ TEST(StorageSharedPtrTest, SharedOwnership) {
   Storage storage1(new MaybeOwningStorage(CPU_DEVICE, kNbytes));
   void* data_ptr = storage1->data();
 
-  Storage storage2 = storage1;
+  const Storage& storage2 = storage1;
 
   EXPECT_EQ(storage1.use_count(), 2);
   EXPECT_EQ(storage2.use_count(), 2);
@@ -208,7 +208,7 @@ TEST(StorageSharedPtrTest, SharedOwnershipModification) {
     data[i] = 0.0f;
   }
 
-  Storage storage2 = storage1;
+  const Storage& storage2 = storage1;
 
   float* data2 = static_cast<float*>(storage2->data());
   for (size_t i = 0; i < kNumFloats; ++i) {
@@ -226,10 +226,7 @@ TEST(StorageSharedPtrTest, ReferenceCountDecrement) {
   Storage storage1(new MaybeOwningStorage(CPU_DEVICE, kNbytes));
   EXPECT_EQ(storage1.use_count(), 1);
 
-  {
-    Storage storage2 = storage1;
-    EXPECT_EQ(storage1.use_count(), 2);
-  }
+  { EXPECT_EQ(storage1.use_count(), 2); }
 
   EXPECT_EQ(storage1.use_count(), 1);
 }
 
@@ -0,0 +1,20 @@
+# This file lists all models that have been confirmed to be functional and tested for the Arm backend:
+- Conformer
+- Deit Tiny
+- DeepLab v3 (DL3)
+- Inception v3 (IC3)
+- Llama
+- Long Short-Term Memory (LSTM)
+- MobileNet v2 (MV2)
+- MobileNet v3 (MV3)
+- Some popular torch.nn.functional models (NN functional)
+- Some popular torch.nn.modules models (NN modules)
+- Some popular torch ops (Torch Functions)
+- Neural Super Sampler (NSS)
+- ResNet 18
+- Wav2Letter (W2L)
+- Stable Diffusion:
+    * CLIP Text Encoder (CLIP Text with Projection)
+    * Stable Diffusion 3 Transformer (SD3 Transformer)
+    * T5 Encoder
+    * VAE Encoder/Decoder (VAE)
@@ -113,6 +113,9 @@
 from .replace_scalar_with_tensor_pass import (  # noqa
     ReplaceScalarWithTensorByProfilePass,
 )
+from .rewrite_bool_to_fp32_cast_via_int8_pass import (  # noqa
+    RewriteBoolToFp32CastViaInt8Pass,
+)
 from .rewrite_conv_pass import RewriteConvPass  # noqa
 from .rewrite_matmul import RewriteMatmulPass  # noqa
 from .rewrite_upsample import RewriteUpsamplePass  # noqa
 
@@ -8,6 +8,7 @@
 from abc import abstractmethod
 from typing import Any, List, Optional, Set, Type
 
+from executorch.backends.arm.constants import DISALLOW_TFA_META_KEY
 from executorch.exir.pass_base import ExportPass, NodeMetadata
 from torch.fx import GraphModule
 from torch.fx.passes.infra.pass_base import PassResult
@@ -16,9 +17,23 @@
 class ArmPass(ExportPass):
     """Base class for Arm passes"""
 
-    def __init__(self) -> None:
-        super().__init__()
+    def __init__(self, tfa_pass: bool = False, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
         self.submodule_depth = 0
+        self.is_tfa_pass = tfa_pass
+
+    def allowed_to_transform(self, meta: NodeMetadata | dict[str, Any]) -> bool:
+        if not self.is_tfa_pass:
+            return True
+
+        if isinstance(meta, NodeMetadata):
+            meta_dict = meta.data
+        else:
+            meta_dict = meta
+
+        disallow_tfa = meta_dict.get(DISALLOW_TFA_META_KEY, False)
+
+        return not disallow_tfa
 
     @property
     @abstractmethod
 
@@ -103,6 +103,7 @@
     RemoveNoopPass,
     ReplaceInfAndLimitValuesPass,
     ReplaceScalarWithTensorByProfilePass,
+    RewriteBoolToFp32CastViaInt8Pass,
     RewriteConvPass,
     RewriteMatmulPass,
     RewriteUpsamplePass,
@@ -221,6 +222,7 @@ def _tosa_pipeline(
         self.add_passes(
             [
                 FuseQuantizedActivationPass(),
+                RewriteBoolToFp32CastViaInt8Pass(),
                 ConvertToClampPass(),
                 DecomposeTOSAUnsupportedClampPass(),
                 DecomposeGroupNormPass(),
@@ -374,65 +376,65 @@ def transform_to_backend_pipeline(
 
     def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         # Preprocessing passes
-        self.add_pass(RemoveGraphAssertsPass())
+        self.add_pass(RemoveGraphAssertsPass(tfa_pass=True))
 
         # Transformation passes (pre scalar -> tensor)
         self.add_passes(
             [
-                DecomposeSelectScatterPass(),
-                ConvertInt64ConstOpsToInt32Pass(),
-                ConvertInt64OutputOpsToInt32Pass(),
-                InsertInt32CastsAfterInt64PlaceholdersPass(),
-                DecomposeEmbeddingPass(),
-                DecomposeScaledDotProductAttentionPass(),
-                DecomposeRoundPass(),
-                DecomposeLogitPass(),
-                PromoteBoolOperandsPass(),
-                DecomposeSignPass(),
-                DecomposeAddmmPass(),
-                DecomposeRemainderPass(),
-                DecomposeFloorDividePass(),
-                DecomposeDivTensorModePass(),
+                DecomposeSelectScatterPass(tfa_pass=True),
+                ConvertInt64ConstOpsToInt32Pass(tfa_pass=True),
+                ConvertInt64OutputOpsToInt32Pass(tfa_pass=True),
+                InsertInt32CastsAfterInt64PlaceholdersPass(tfa_pass=True),
+                DecomposeEmbeddingPass(tfa_pass=True),
+                DecomposeScaledDotProductAttentionPass(tfa_pass=True),
+                DecomposeRoundPass(tfa_pass=True),
+                DecomposeLogitPass(tfa_pass=True),
+                PromoteBoolOperandsPass(tfa_pass=True),
+                DecomposeSignPass(tfa_pass=True),
+                DecomposeAddmmPass(tfa_pass=True),
+                DecomposeRemainderPass(tfa_pass=True),
+                DecomposeFloorDividePass(tfa_pass=True),
+                DecomposeDivTensorModePass(tfa_pass=True),
             ]
         )
 
         # Scalars -> tensors
         self.add_passes(
             [
-                ReplaceScalarWithTensorByProfilePass(),
-                ScalarsToAttributePass(),
+                ReplaceScalarWithTensorByProfilePass(tfa_pass=True),
+                ScalarsToAttributePass(tfa_pass=True),
             ]
         )
 
         # Transformation passes (post scalar removal)
         self.add_passes(
             [
-                NormalizeWhileInitialArgsPass(use_exir_clone=False),
-                DecomposeAddSubAlphaPass(),
-                DecomposeGroupNormPass(),
-                DecomposeLayerNormPass(),
-                DecomposeVarPass(),
-                DecomposeMeanDimPass(graph_module, self.tosa_spec),
-                DecomposeNotEqualPass(),
-                DecomposeCosineSimilarityPass(),
-                DecomposeGluPass(),
-                DecomposeDivPass(),
-                DecomposeLeakyReLUPass(),
-                DecomposeLinalgVectorNormPass(),
-                DecomposeSqrtPass(),
-                DecomposeSiluPass(),
-                DecomposeAvgPool2dPass(),
-                DecomposeSoftmaxUnstablePass(),
-                DecomposeSoftmaxPass(),
-                ConvertMinMaxPass(),
+                NormalizeWhileInitialArgsPass(use_exir_clone=False, tfa_pass=True),
+                DecomposeAddSubAlphaPass(tfa_pass=True),
+                DecomposeGroupNormPass(tfa_pass=True),
+                DecomposeLayerNormPass(tfa_pass=True),
+                DecomposeVarPass(tfa_pass=True),
+                DecomposeMeanDimPass(graph_module, self.tosa_spec, tfa_pass=True),
+                DecomposeNotEqualPass(tfa_pass=True),
+                DecomposeCosineSimilarityPass(tfa_pass=True),
+                DecomposeGluPass(tfa_pass=True),
+                DecomposeDivPass(tfa_pass=True),
+                DecomposeLeakyReLUPass(tfa_pass=True),
+                DecomposeLinalgVectorNormPass(tfa_pass=True),
+                DecomposeSqrtPass(tfa_pass=True),
+                DecomposeSiluPass(tfa_pass=True),
+                DecomposeAvgPool2dPass(tfa_pass=True),
+                DecomposeSoftmaxUnstablePass(tfa_pass=True),
+                DecomposeSoftmaxPass(tfa_pass=True),
+                ConvertMinMaxPass(tfa_pass=True),
             ]
         )
 
         # Postprocessing passes
         self.add_passes(
             [
-                ReplaceInfAndLimitValuesPass(),
-                DecomposeMaskedFillPass(),
+                ReplaceInfAndLimitValuesPass(tfa_pass=True),
+                DecomposeMaskedFillPass(tfa_pass=True),
             ]
         )
 
 
@@ -23,8 +23,8 @@ class CastInt64BuffersToInt32Pass(ArmPass):
 
     _passes_required_after: Set[Type[ExportPass]] = set()
 
-    def __init__(self, exported_program: ExportedProgram):
-        super().__init__()
+    def __init__(self, exported_program: ExportedProgram, *args, **kwargs):
+        super().__init__(*args, **kwargs)
         self.exported_program = exported_program
 
     def _assert_within_int32(self, tensor: torch.Tensor, node: torch.fx.Node):
Original file line number	Diff line number	Diff line change
`@@ -1 +1 @@`
`1`		`-7a064ed3eafa43f17412d434b395240c727b3000`
	`1`	`+7a79b41e29a790ebb4b530eb98a89381e2d7de29`