
Commit 8079eb7

mcr229 authored and facebook-github-bot committed
Kleidi Integration (pytorch#5162)
Summary:

# Bringing KleidiAI QB4 Kernels to ExecuTorch

KleidiAI has released QB4 kernels that pack the activation while dynamically quantizing it, improving the performance of the GEMM kernel. We leverage these kernels through XNNPACK by wiring them up there. This integration is still waiting on a couple of dependent PRs in other repos to land.

## Dependent PR Tracking

* google/XNNPACK#7003
* https://gitlab.arm.com/kleidi/kleidiai/-/merge_requests/28

## Notes on the Update

When updating XNNPACK to the branch with the integrated Kleidi kernels, we have to make some changes to the CMake files because of refactoring done in XNNPACK. microkernels-prod and kleidiai are both static libraries linked into libXNNPACK.a. Since the llama runner (which links against xnnpack_backend) lives in a separate project, we need to install these new static libraries so that they can later be linked properly into the llama runner. These changes can be seen in the corresponding CMake files. The new feature is currently guarded behind the EXECUTORCH_XNNPACK_ENABLE_KLEIDI flag.

## Repro

```
git submodule sync
git submodule update --init
```

I used the following aliases to make it easier to build llama_main for Android:

```
alias build_et_android="cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
    -DANDROID_ABI=arm64-v8a \
    -DANDROID_PLATFORM=android-23 \
    -DCMAKE_INSTALL_PREFIX=cmake-out-android \
    -DEXECUTORCH_ENABLE_LOGGING=1 \
    -DCMAKE_BUILD_TYPE=Release \
    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
    -DEXECUTORCH_BUILD_XNNPACK=ON \
    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
    -DEXECUTORCH_XNNPACK_ENABLE_KLEIDI=ON \
    -DXNNPACK_ENABLE_ARM_BF16=OFF \
    -Bcmake-out-android . \
    && cmake --build cmake-out-android -j16 --target install --config Release"

alias build_llama_android="cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
    -DANDROID_ABI=arm64-v8a \
    -DANDROID_PLATFORM=android-23 \
    -DCMAKE_INSTALL_PREFIX=cmake-out-android \
    -DCMAKE_BUILD_TYPE=Release \
    -DPYTHON_EXECUTABLE=python \
    -DEXECUTORCH_BUILD_XNNPACK=ON \
    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
    -DEXECUTORCH_USE_TIKTOKEN=ON \
    -Bcmake-out-android/examples/models/llama2 \
    examples/models/llama2 \
    && cmake --build cmake-out-android/examples/models/llama2 -j16 --config Release"
```

I run the following:

```
build_et_android
build_llama_android
cd cmake-out-android/examples/models/llama2
adb push llama_main /data/local/tmp/
adb push <path/to/llama3.pte> /data/local/tmp
adb push <path/to/tiktokenizer> /data/local/tmp
adb shell "cd /data/local/tmp && ./llama_main --model_path <model.pte> --tokenizer_path <tokenizer.bin> --cpu_threads=4"
```

## Benchmarks

I ran llama3.1 with:

* sdpa_w_kvcache
* quantized embeddings
* 4-bit blockwise quantized weights
* dynamic shapes
* parallel prefill

on a Samsung S22 with 4 threads.

### Baseline (QD8)

```
I 00:00:32.772974 executorch:stats.h:84] Prompt Tokens: 8 Generated Tokens: 119
I 00:00:32.772980 executorch:stats.h:90] Model Load Time: 15.273000 (seconds)
I 00:00:32.773014 executorch:stats.h:100] Total inference time: 17.488000 (seconds) Rate: 6.804666 (tokens/second)
I 00:00:32.773019 executorch:stats.h:108] Prompt evaluation: 2.971000 (seconds) Rate: 2.692696 (tokens/second)
I 00:00:32.773023 executorch:stats.h:119] Generated 119 tokens: 14.517000 (seconds) Rate: 8.197286 (tokens/second)
I 00:00:32.773027 executorch:stats.h:127] Time to first generated token: 2.971000 (seconds)
I 00:00:32.773030 executorch:stats.h:134] Sampling time over 127 tokens: 0.173000 (seconds)
```

### QP8

```
I 00:00:46.767429 executorch:stats.h:84] Prompt Tokens: 8 Generated Tokens: 119
I 00:00:46.767437 executorch:stats.h:90] Model Load Time: 28.297000 (seconds)
I 00:00:46.767475 executorch:stats.h:100] Total inference time: 18.436000 (seconds) Rate: 6.454762 (tokens/second)
I 00:00:46.767483 executorch:stats.h:108] Prompt evaluation: 1.770000 (seconds) Rate: 4.519774 (tokens/second)
I 00:00:46.767491 executorch:stats.h:119] Generated 119 tokens: 16.666000 (seconds) Rate: 7.140286 (tokens/second)
I 00:00:46.767522 executorch:stats.h:127] Time to first generated token: 1.770000 (seconds)
I 00:00:46.767527 executorch:stats.h:134] Sampling time over 127 tokens: 0.189000 (seconds)
```

We see a ~68% performance improvement on prefill and a ~13% regression on decode. See the dependent XNNPACK PR for more benchmarking details.

Pull Request resolved: pytorch#5162

Reviewed By: digantdesai

Differential Revision: D63651987

Pulled By: mcr229

fbshipit-source-id: aafc92b5006c90f3465af415acc04309851dcd8c
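The prefill and decode deltas quoted above follow directly from the logged rates. A quick sanity check in Python (the rates are copied verbatim from the two logs; the variable names are mine):

```python
# Recompute the QD8 -> QP8 deltas from the rates in the benchmark logs above.
qd8_prefill = 2.692696  # tokens/second, baseline prompt evaluation
qp8_prefill = 4.519774  # tokens/second, QP8 prompt evaluation
qd8_decode = 8.197286   # tokens/second, baseline generation
qp8_decode = 7.140286   # tokens/second, QP8 generation

prefill_delta = qp8_prefill / qd8_prefill - 1.0  # roughly +0.68
decode_delta = qp8_decode / qd8_decode - 1.0     # roughly -0.13

print(f"prefill: {prefill_delta:+.0%}, decode: {decode_delta:+.0%}")
```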
1 parent 660ef77 commit 8079eb7

File tree

482 files changed (+195, −3857 lines)



backends/xnnpack/CMakeLists.txt

Lines changed: 10 additions & 4 deletions
```diff
@@ -32,14 +32,20 @@ if(NOT PYTHON_EXECUTABLE)
   resolve_python_executable()
 endif()
 
-# NB: Enabling this will serialize execution of delegate instances.
-# This setting may have performance implications.
+# NB: Enabling this will serialize execution of delegate instances
+# Keeping this OFF by default to maintain existing behavior, to be revisited.
 option(EXECUTORCH_XNNPACK_SHARED_WORKSPACE
-  "Enable workspace sharing across different delegate instances" ON
-)
+  "Enable workspace sharing across different delegate instances" ON)
+# Keeping this OFF by default due to regressions in decode
+# and model load with kleidi kernels
+option(EXECUTORCH_XNNPACK_ENABLE_KLEIDI
+  "Enable workspace sharing across different delegate instances" OFF)
 if(EXECUTORCH_XNNPACK_SHARED_WORKSPACE)
   add_definitions(-DENABLE_XNNPACK_SHARED_WORKSPACE)
 endif()
+if(EXECUTORCH_XNNPACK_ENABLE_KLEIDI)
+  add_definitions(-DENABLE_XNNPACK_KLEIDI)
+endif()
 
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
 set(_common_compile_options -Wno-deprecated-declarations -fPIC)
```

backends/xnnpack/cmake/Dependencies.cmake

Lines changed: 27 additions & 1 deletion
```diff
@@ -36,13 +36,39 @@ set(XNNPACK_ENABLE_AVXVNNI
   OFF
   CACHE BOOL ""
 )
-set(XNNPACK_ENABLE_KLEIDIAI
+
+if(EXECUTORCH_XNNPACK_ENABLE_KLEIDI)
+  set(XNNPACK_ENABLE_KLEIDIAI
+    ON
+    CACHE BOOL ""
+  )
+else()
+  set(XNNPACK_ENABLE_KLEIDIAI
+    OFF
+    CACHE BOOL ""
+  )
+endif()
+
+
+set(XNNPACK_BUILD_ALL_MICROKERNELS
   OFF
   CACHE BOOL ""
 )
 add_subdirectory("${XNNPACK_SOURCE_DIR}")
 include_directories(SYSTEM ${XNNPACK_INCLUDE_DIR})
 list(APPEND xnnpack_third_party XNNPACK)
+install(TARGETS microkernels-prod
+        LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+        PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+
+
+if(EXECUTORCH_XNNPACK_ENABLE_KLEIDI)
+  install(TARGETS kleidiai
+          LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+          ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+          PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+endif()
 
 # Revert PIC Flag to what it originally was
 set(CMAKE_POSITION_INDEPENDENT_CODE
```

backends/xnnpack/runtime/XNNCompiler.cpp

Lines changed: 7 additions & 0 deletions
```diff
@@ -630,7 +630,14 @@ Error defineConvertNode(
       subgraph_ptr,
       remapped_ids.at(graph_node->input_id()),
       remapped_ids.at(graph_node->output_id()),
+#ifdef ENABLE_XNNPACK_KLEIDI
+      // This maps to XNNPACK's XNN_FLAG_MAYBE_PACK_FOR_QB4W_GEMM
+      // however this is not currently exposed at top level
+      // xnnpack.h Header
+      0x00000100);
+#else
       graph_node->flags());
+#endif
 
   ET_CHECK_OR_RETURN_ERROR(
       status == xnn_status_success,
```
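The hard-coded 0x00000100 above corresponds to XNNPACK's XNN_FLAG_MAYBE_PACK_FOR_QB4W_GEMM, which opts the convert node into the 4-bit blockwise-quantized-weight (QB4W) GEMM path. As a rough illustration of the numerics that scheme assumes (a hypothetical Python sketch, not the XNNPACK/KleidiAI packing code):

```python
# Hypothetical sketch of symmetric 4-bit blockwise (per-group) weight
# quantization, the scheme behind QB4W GEMM kernels. This is NOT the
# XNNPACK/KleidiAI implementation, only an illustration of the numerics.

def quantize_qb4w(weights, block_size=32):
    """Quantize a flat list of floats to signed int4, one scale per block."""
    quants, scales = [], []
    for start in range(0, len(weights), block_size):
        block = weights[start:start + block_size]
        # One scale per block, chosen so the largest magnitude maps to +/-7.
        amax = max(abs(w) for w in block) or 1.0
        scale = amax / 7.0
        scales.append(scale)
        # Signed int4 range is [-8, 7]; round to nearest, then clamp.
        quants.extend(max(-8, min(7, round(w / scale))) for w in block)
    return quants, scales

def dequantize_qb4w(quants, scales, block_size=32):
    return [q * scales[i // block_size] for i, q in enumerate(quants)]

weights = [0.05 * i for i in range(-16, 16)]  # 32 weights = one block
quants, scales = quantize_qb4w(weights)
recon = dequantize_qb4w(quants, scales)
# Reconstruction error is bounded by about half a quantization step.
max_err = max(abs(w, ) if False else abs(w - r) for w, r in zip(weights, recon))
```

At runtime the QD8 path quantizes activations dynamically per row, while the QP8 path additionally packs them for the KleidiAI QB4 microkernels; the weight-side numerics sketched here are the same in both cases.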

backends/xnnpack/targets.bzl

Lines changed: 2 additions & 0 deletions
```diff
@@ -49,6 +49,8 @@ def define_common_targets():
         preprocessor_flags = [
             # Uncomment to enable per operator timings
             # "-DENABLE_XNNPACK_PROFILING",
+            # Uncomment to enable using KleidiAI Kernels
+            # "-DENABLE_XNNPACK_KLEIDI"
         ] + _get_preprocessor_flags(),
         exported_deps = [
             "//executorch/runtime/backend:interface",
```

backends/xnnpack/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -39,6 +39,7 @@ et_cxx_test(
   XNNPACK
   pthreadpool
   cpuinfo
+  microkernels-prod
 )
 target_include_directories(
   backends_xnnpack_test
```

backends/xnnpack/test/runtime/test_xnnexecutor.cpp

Lines changed: 5 additions & 5 deletions
```diff
@@ -9,7 +9,7 @@
 #include <executorch/backends/xnnpack/runtime/XNNExecutor.h>
 #include <executorch/runtime/core/exec_aten/testing_util/tensor_factory.h>
 #include <gtest/gtest.h>
-#include <xnnpack/subgraph.h>
+#include <xnnpack.h>
 
 using torch::executor::Error;
 using torch::executor::EValue;
@@ -26,7 +26,7 @@ TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) {
   std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> auto_subgraph(
       subgraph, xnn_delete_subgraph);
 
-  auto input_id = XNN_INVALID_NODE_ID;
+  auto input_id = XNN_INVALID_VALUE_ID;
   std::vector<size_t> dims = {
       1,
   };
@@ -43,9 +43,9 @@ TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) {
       /*external_id=*/0,
       /*flags=*/XNN_VALUE_FLAG_EXTERNAL_INPUT,
       &input_id));
-  ASSERT_NE(input_id, XNN_INVALID_NODE_ID);
+  ASSERT_NE(input_id, XNN_INVALID_VALUE_ID);
 
-  auto output_id = XNN_INVALID_NODE_ID;
+  auto output_id = XNN_INVALID_VALUE_ID;
   ASSERT_EQ(
       xnn_status_success,
       xnn_define_quantized_tensor_value(
@@ -59,7 +59,7 @@ TEST(XNNExecutorTest, ArgumentWithTooManyDimensions) {
       /*external_id=*/0,
       /*flags=*/XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
       &output_id));
-  ASSERT_NE(output_id, XNN_INVALID_NODE_ID);
+  ASSERT_NE(output_id, XNN_INVALID_VALUE_ID);
 
   ASSERT_EQ(
       xnn_status_success,
```

backends/xnnpack/test/targets.bzl

Lines changed: 1 addition & 0 deletions
```diff
@@ -24,6 +24,7 @@ def define_common_targets():
         srcs = ["runtime/test_xnnexecutor.cpp"],
         deps = [
             third_party_dep("XNNPACK"),
+            third_party_dep("FP16"),
             "//executorch/runtime/core/exec_aten/testing_util:tensor_util",
             "//executorch/runtime/core/exec_aten/util:scalar_type_util",
             "//executorch/backends/xnnpack:xnnpack_backend",
```
Submodule XNNPACK updated 9962 files

backends/xnnpack/third-party/generate-xnnpack-wrappers.py

Lines changed: 0 additions & 213 deletions
This file was deleted.
