Skip to content

Commit b4a6174

Browse files
author
ssjia
committed
Update
[ghstack-poisoned]
2 parents 8fb1164 + 6c74651 commit b4a6174

File tree

14 files changed

+503
-231
lines changed

14 files changed

+503
-231
lines changed

.github/workflows/build-presets.yml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,6 @@ on:
66
branches:
77
- main
88
- release/*
9-
paths:
10-
- .github/workflows/build-presets.yml
119
workflow_dispatch:
1210

1311
concurrency:

backends/vulkan/test/scripts/test_model.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,6 @@ build_core_libraries_and_devtools() {
117117
-DEXECUTORCH_BUILD_DEVTOOLS=ON \
118118
-DEXECUTORCH_BUILD_VULKAN=ON \
119119
-DEXECUTORCH_BUILD_XNNPACK=ON \
120-
-DEXECUTORCH_BUILD_TESTS=ON \
121120
-Bcmake-out && \
122121
cmake --build cmake-out -j64 --target install
123122

docs/source/llm/export-llm.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,16 @@
22

33
Rather than requiring you to manually write code that calls torch.export(), wires up ExecuTorch's assortment of lowering APIs, or interacts with TorchAO quantize_ APIs for quantization, we provide an out-of-the-box experience that performantly exports a selection of supported models to ExecuTorch.
44

5+
## Prerequisites
6+
7+
The LLM export functionality requires the `pytorch_tokenizers` package. If you encounter a `ModuleNotFoundError: No module named 'pytorch_tokenizers'` error, install it from the ExecuTorch source tree:
8+
9+
```bash
10+
pip install -e ./extension/llm/tokenizers/
11+
```
12+
13+
## Supported Models
14+
515
As of this doc, the list of supported LLMs include the following:
616
- Llama 2/3/3.1/3.2
717
- Qwen 2.5/3

docs/source/using-executorch-faqs.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,13 @@ sudo apt install python<version>-dev
1414
```
1515
if you are using Ubuntu, or use an equivalent install command.
1616

17+
### ModuleNotFoundError: No module named 'pytorch_tokenizers'
18+
19+
The `pytorch_tokenizers` package is required for LLM export functionality. Install it from the ExecuTorch source tree:
20+
```
21+
pip install -e ./extension/llm/tokenizers/
22+
```
23+
1724
## Export
1825

1926
### Missing out variants: { _ }

examples/vulkan/aot_compiler.py

Lines changed: 0 additions & 204 deletions
This file was deleted.
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/kernels/portable/cpu/scalar_utils.h>
10+
#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
11+
#include <executorch/runtime/kernel/kernel_includes.h>
12+
13+
namespace torch {
14+
namespace executor {
15+
namespace native {
16+
17+
using Tensor = executorch::aten::Tensor;
18+
19+
template <typename T>
20+
using OptionalArrayRef = executorch::aten::OptionalArrayRef<T>;
21+
22+
/**
23+
* _clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]?
24+
* dim_order=None, Tensor(a!) out) -> Tensor(a!)
25+
*
26+
* Clones via element-wise copy while preserving dim_order.
27+
*/
28+
Tensor& _clone_dim_order_out(
29+
KernelRuntimeContext& ctx,
30+
const Tensor& self,
31+
bool non_blocking,
32+
OptionalArrayRef<int64_t> dim_order,
33+
Tensor& out) {
34+
// NOTE(review): ctx IS referenced by the ET_KERNEL_CHECK / ET_SWITCH macros
// below; this cast presumably only silences an unused-parameter warning in
// configurations where those macros compile out -- confirm before removing.
(void)ctx;
35+
36+
// Ensure input and output dtype match.
37+
ET_KERNEL_CHECK(
38+
ctx, self.scalar_type() == out.scalar_type(), InvalidArgument, out);
39+
40+
// Ensure output has the same layout as input or matches dim_order.
41+
ET_KERNEL_CHECK(
42+
ctx,
43+
check__to_dim_order_copy_args(self, non_blocking, dim_order, out),
44+
InvalidArgument,
45+
out);
46+
47+
// Ensure input and output shapes match, resizing if necessary.
48+
ET_KERNEL_CHECK(
49+
ctx,
50+
resize_tensor(out, self.sizes()) == torch::executor::Error::Ok,
51+
InvalidArgument,
52+
out);
53+
54+
// Empty tensor: nothing to copy. All argument/shape checks above have
// already run, so returning out directly is safe.
if (self.numel() == 0) {
55+
return out;
56+
}
57+
58+
// Select the correct input dtype and copy the tensors.
// Input and output share a dtype (checked above), so the same CTYPE is used
// for both template arguments of the element-wise copy helper.
59+
ET_SWITCH_REALHBBF16_TYPES(
60+
self.scalar_type(),
61+
ctx,
62+
"dim_order_ops::_clone_dim_order.out",
63+
CTYPE,
64+
[&] { _to_dim_order_copy_impl<CTYPE, CTYPE>(self, out); });
65+
66+
return out;
67+
}
68+
69+
/*
 * Context-free convenience overload of _clone_dim_order.out.
 *
 * Builds a default KernelRuntimeContext and forwards all arguments to the
 * primary overload above, which performs the argument checks and the copy.
 */
Tensor& _clone_dim_order_out(
    const Tensor& self,
    bool non_blocking,
    OptionalArrayRef<int64_t> dim_order,
    Tensor& out) {
  executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext ctx{};
  return _clone_dim_order_out(ctx, self, non_blocking, dim_order, out);
}
77+
78+
} // namespace native
79+
} // namespace executor
80+
} // namespace torch

kernels/portable/cpu/op__to_dim_order_copy.cpp

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -29,29 +29,6 @@ using OptionalArrayRef = executorch::aten::OptionalArrayRef<T>;
2929
template <typename T>
3030
using Optional = std::optional<T>;
3131

32-
namespace {
33-
34-
template <typename SELF_CTYPE, typename OUT_CTYPE>
35-
void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) {
36-
auto self_data = self.mutable_data_ptr<SELF_CTYPE>();
37-
auto out_data = out.mutable_data_ptr<OUT_CTYPE>();
38-
39-
// Here we make a slightly off-label use of
40-
// BroadcastIndexesRange. It always assumes it doesn't have to care
41-
// about different dim_order between input and output, but we can
42-
// just force it to respect strides (and thus dim_order) for its
43-
// inputs using support_noncontiguous_input_tensors=true, and then pretend
44-
// the output is just another input.
45-
for (const auto [unused_index, self_data_index, out_data_index] :
46-
BroadcastIndexesRange<2, /*support_noncontiguous_input_tensors=*/true>(
47-
/*dummy output*/ self, self, out)) {
48-
(void)unused_index;
49-
out_data[out_data_index] =
50-
static_cast<OUT_CTYPE>(self_data[self_data_index]);
51-
}
52-
}
53-
} // namespace
54-
5532
// _to_dim_order_copy.out(Tensor self, *, bool non_blocking=False, int[]?
5633
// dim_order=None, Tensor(a!) out) -> Tensor(a!)
5734
Tensor& _to_dim_order_copy_out(

0 commit comments

Comments
 (0)