pytorch
diff --git a/.Package.swift/kernels_torchao/dummy.swift b/.Package.swift/kernels_torchao/dummy.swift
diff --git a/.Package.swift/kernels_torchao_debug/dummy.swift b/.Package.swift/kernels_torchao_debug/dummy.swift
diff --git a/.ci/scripts/test_llama_torchao_lowbit.sh b/.ci/scripts/test_llama_torchao_lowbit.sh
Lines changed: 5 additions & 10 deletions
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
Lines changed: 1 addition & 1 deletion
diff --git a/.gitmodules b/.gitmodules
Lines changed: 0 additions & 3 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
Lines changed: 59 additions & 23 deletions
diff --git a/Package.swift b/Package.swift
Lines changed: 5 additions & 0 deletions
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
Lines changed: 3 additions & 0 deletions
diff --git a/backends/arm/_passes/decompose_logit_pass.py b/backends/arm/_passes/decompose_logit_pass.py
Lines changed: 96 additions & 0 deletions
@@ -29,27 +29,22 @@ cmake -DPYTHON_EXECUTABLE=python \
     -DEXECUTORCH_ENABLE_LOGGING=1 \
     -DCMAKE_BUILD_TYPE=Release \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_BUILD_XNNPACK=OFF \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_KERNELS_TORCHAO=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
     -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
     -Bcmake-out .
-cmake --build cmake-out -j16 --target install --config Release
+cmake --build cmake-out -j16 --config Release --target install
 
 # Install llama runner with torchao
 cmake -DPYTHON_EXECUTABLE=python \
-    -DBUILD_TESTING=OFF \
     -DCMAKE_BUILD_TYPE=Release \
-    -DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=OFF \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_TORCHAO=ON \
     -Bcmake-out/examples/models/llama \
     examples/models/llama
 cmake --build cmake-out/examples/models/llama -j16 --config Release
 
@@ -485,7 +485,7 @@ jobs:
         eval "$(conda shell.bash hook)"
 
         # Install requirements
-        ${CONDA_RUN} EXECUTORCH_BUILD_TORCHAO=1 python install_executorch.py
+        ${CONDA_RUN} EXECUTORCH_BUILD_KERNELS_TORCHAO=1 python install_executorch.py
         ${CONDA_RUN} sh examples/models/llama/install_requirements.sh
 
         # Run test
 
@@ -1,9 +1,6 @@
 [submodule "backends/arm/third-party/ethos-u-core-driver"]
 	path = backends/arm/third-party/ethos-u-core-driver
 	url = https://git.gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-core-driver.git
-[submodule "backends/arm/third-party/serialization_lib"]
-	path = backends/arm/third-party/serialization_lib
-	url = https://git.gitlab.arm.com/tosa/tosa-serialization.git
 [submodule "backends/vulkan/third-party/Vulkan-Headers"]
 	path = backends/vulkan/third-party/Vulkan-Headers
 	url = https://github.com/KhronosGroup/Vulkan-Headers
 
@@ -278,29 +278,6 @@ if(EXECUTORCH_BUILD_PTHREADPOOL)
   )
 endif()
 
-if(EXECUTORCH_BUILD_KERNELS_TORCHAO)
-  set(TORCHAO_BUILD_ATEN_OPS OFF)
-  set(TORCHAO_BUILD_EXECUTORCH_OPS ON)
-  set(TORCHAO_BUILD_CPU_AARCH64 ON)
-  set(TORCHAO_ENABLE_ARM_NEON_DOT ON)
-
-  list(
-    APPEND
-    TORCHAO_INCLUDE_DIRS
-    ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
-    ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
-    ${EXECUTORCH_ROOT}/third-party/ao
-  )
-
-  set(EXECUTORCH_INCLUDE_DIRS ${TORCHAO_INCLUDE_DIRS})
-
-  add_subdirectory(
-    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/ao/torchao/experimental
-  )
-  executorch_target_link_options_shared_lib(torchao_ops_executorch)
-  list(APPEND _executorch_kernels torchao_ops_executorch)
-endif()
-
 if(EXECUTORCH_BUILD_TESTS)
   set(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
   include(CTest)
@@ -705,6 +682,65 @@ if(EXECUTORCH_BUILD_PTHREADPOOL AND EXECUTORCH_BUILD_CPUINFO)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/threadpool)
 endif()
 
+if(EXECUTORCH_BUILD_KERNELS_TORCHAO)  # whole section is skipped unless this option is truthy
+  if(NOT TARGET cpuinfo)  # the cpuinfo target must already exist when this point is reached
+    message(
+      FATAL_ERROR
+        "EXECUTORCH_BUILD_KERNELS_TORCHAO requires EXECUTORCH_BUILD_CPUINFO be set ON"
+    )
+  endif()
+  if(NOT TARGET pthreadpool)  # same precondition for the pthreadpool target
+    message(
+      FATAL_ERROR
+        "EXECUTORCH_BUILD_KERNELS_TORCHAO requires EXECUTORCH_BUILD_PTHREADPOOL be set ON"
+    )
+  endif()
+
+  # Configure TorchAO kernels (presumably read by the torchao/experimental subdirectory added below)
+  set(TORCHAO_BUILD_ATEN_OPS OFF)
+  set(TORCHAO_BUILD_EXECUTORCH_OPS ON)
+  set(TORCHAO_BUILD_CPU_AARCH64 ON)
+  set(TORCHAO_ENABLE_ARM_NEON_DOT ON)
+  set(TORCHAO_BUILD_KLEIDIAI ON)  # also consulted by the kleidiai install check further down
+
+  # TorchAO kernels look for EXECUTORCH_INCLUDE_DIRS; abort rather than overwrite an existing value
+  if(DEFINED EXECUTORCH_INCLUDE_DIRS)
+    message(FATAL_ERROR "EXECUTORCH_INCLUDE_DIRS is already defined")
+  endif()
+  set(EXECUTORCH_INCLUDE_DIRS  # pthreadpool and cpuinfo headers from the xnnpack third-party tree
+      ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/pthreadpool/include
+      ${EXECUTORCH_ROOT}/backends/xnnpack/third-party/cpuinfo/include
+  )
+  add_subdirectory(
+    ${CMAKE_CURRENT_SOURCE_DIR}/third-party/ao/torchao/experimental
+  )
+  unset(EXECUTORCH_INCLUDE_DIRS)  # drop the variable again once the subdirectory has been added
+
+  executorch_target_link_options_shared_lib(torchao_ops_executorch)
+  list(APPEND _executorch_kernels torchao_ops_executorch)  # record the target in the _executorch_kernels list
+
+  install(  # both targets join the ExecuTorchTargets export set and install under lib
+    TARGETS torchao_ops_executorch torchao_kernels_aarch64
+    EXPORT ExecuTorchTargets
+    DESTINATION lib
+    INCLUDES
+    DESTINATION ${_common_include_directories}
+  )
+  # If using KleidiAI and XNNPACK has not installed it already, install it
+  if(TORCHAO_BUILD_KLEIDIAI AND NOT (EXECUTORCH_BUILD_XNNPACK
+                                     AND EXECUTORCH_XNNPACK_ENABLE_KLEIDI)
+  )
+    install(  # same export set and destinations as the TorchAO targets above
+      TARGETS kleidiai
+      EXPORT ExecuTorchTargets
+      DESTINATION lib
+      INCLUDES
+      DESTINATION ${_common_include_directories}
+    )
+  endif()
+
+endif()
+
 if(EXECUTORCH_BUILD_PYBIND)
 
   # Add codegen tools subdirectory for selective_build pybind module
 
@@ -84,6 +84,11 @@ let products = deliverables([
     ],
   ],
   "kernels_quantized": [:],
+  "kernels_torchao": [
+    "targets": [
+      "threadpool",
+    ],
+  ],
 ])
 
 let targets = deliverables([
 
@@ -44,6 +44,7 @@
 from .decompose_leaky_relu_pass import DecomposeLeakyReLUPass  # noqa
 from .decompose_linalg_vector_norm_pass import DecomposeLinearVectorNormPass  # noqa
 from .decompose_linear_pass import DecomposeLinearPass  # noqa
+from .decompose_logit_pass import DecomposeLogitPass  # noqa
 from .decompose_masked_fill import DecomposeMaskedFill  # noqa
 from .decompose_maxpool2d_with_dilation import DecomposeMaxPool2DPass  # noqa
 from .decompose_meandim_pass import DecomposeMeanDimPass  # noqa
 
@@ -49,6 +49,7 @@
     DecomposeLeakyReLUPass,
     DecomposeLinearPass,
     DecomposeLinearVectorNormPass,
+    DecomposeLogitPass,
     DecomposeMaskedFill,
     DecomposeMaxPool2DPass,
     DecomposeMeanDimPass,
@@ -166,6 +167,7 @@ def _tosa_INT_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
 
     def _tosa_FP_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(DecomposeExpm1Pass())
+        self.add_pass(DecomposeLogitPass())
         self.add_pass(DecomposeMaskedFill())
         self.add_pass(DecomposeRoundPass())
         self.add_pass(DecomposeAcoshPass())
@@ -257,6 +259,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeEmbeddingPass())
         self.add_pass(DecomposeScaledDotProductAttention())
         self.add_pass(DecomposeRoundPass())
+        self.add_pass(DecomposeLogitPass())
         self.add_pass(CastBoolToInt8Pass())
         self.add_pass(DecomposeSignPass())
         self.add_pass(DecomposeAddmmPass())
 
@@ -0,0 +1,96 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from executorch.backends.arm._passes import ArmPass
+from executorch.exir.dialects._ops import ops as exir_ops
+
+
+# For FP case: the edge-dialect logit overload
+edge_logit = exir_ops.edge.aten.logit.default
+# For INT case: the plain torch.ops.aten logit overload
+aten_logit = torch.ops.aten.logit.default
+
+
+def get_ops(op):
+    """Returns the appropriate operator functions based on the input operator."""
+    if op == edge_logit:  # edge-dialect logit -> edge-dialect primitives
+        return (  # order: log, add.Scalar, reciprocal, mul.Tensor, mul.Scalar, clamp
+            exir_ops.edge.aten.log.default,
+            exir_ops.edge.aten.add.Scalar,
+            exir_ops.edge.aten.reciprocal.default,
+            exir_ops.edge.aten.mul.Tensor,
+            exir_ops.edge.aten.mul.Scalar,
+            exir_ops.edge.aten.clamp.default,
+        )
+    elif op == aten_logit:  # torch.ops.aten logit -> torch.ops.aten primitives
+        return (  # same order as the edge-dialect tuple above
+            torch.ops.aten.log.default,
+            torch.ops.aten.add.Scalar,
+            torch.ops.aten.reciprocal.default,
+            torch.ops.aten.mul.Tensor,
+            torch.ops.aten.mul.Scalar,
+            torch.ops.aten.clamp.default,
+        )
+    else:  # neither logit overload: callers get a ValueError
+        raise ValueError(f"Unsupported operator: {op}")
+
+
+class DecomposeLogitPass(ArmPass):
+    """
+    Decomposes the `logit` operator into a sequence of primitive operations.
+
+    If `eps` is provided, the input tensor `x` is first clamped to the range
+    [eps, 1 - eps].
+
+    The decomposition follows the identity:
+
+        logit(x) = log(x / (1 - x))
+
+    Examples:
+
+        logit(x) becomes:
+            log(x * reciprocal((-1) * x + 1))
+
+        logit(x, eps) becomes:
+            y = clamp(x, eps, 1 - eps)
+            log(y * reciprocal((-1) * y + 1))
+    """
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in [edge_logit, aten_logit]:  # every other op is forwarded unchanged
+            return super().call_operator(op, args, kwargs, meta)
+
+        X = args[0]  # tensor operand of logit
+        eps = args[1] if len(args) > 1 else kwargs.get("eps", None)  # positional or keyword; None when absent
+
+        (  # primitives matching the dialect of `op`, in the order get_ops returns them
+            log_op,
+            add_scalar_op,
+            recip_op,
+            mul_tensor_op,
+            mul_scalar_op,
+            clamp_op,
+        ) = get_ops(op)
+
+        if eps is not None:  # optional clamp; X is rebound to the clamped value
+            X = super().call_operator(
+                clamp_op, (X, eps, 1.0 - eps), {}, meta, updated=True
+            )
+
+        neg_X = super().call_operator(mul_scalar_op, (X, -1.0), {}, meta, updated=True)  # -X
+
+        denom = super().call_operator(  # (-X) + 1, i.e. 1 - X
+            add_scalar_op, (neg_X, 1.0), {}, meta, updated=True
+        )
+
+        frac = super().call_operator(recip_op, (denom,), {}, meta, updated=True)  # 1 / (1 - X)
+
+        log_input = super().call_operator(  # X * (1 / (1 - X))
+            mul_tensor_op, (X, frac), {}, meta, updated=True
+        )
+
+        return super().call_operator(log_op, (log_input,), {}, meta, updated=True)  # log(X / (1 - X))