
Commit 10f89f8

Merge branch 'main' into cherry-pick-13261-by-pytorch_bot_bot_
2 parents 32ef584 + 3a02146 commit 10f89f8

File tree

11 files changed: +595 −79 lines

.github/workflows/build-presets.yml

Lines changed: 0 additions & 2 deletions
@@ -6,8 +6,6 @@ on:
     branches:
       - main
       - release/*
-    paths:
-      - .github/workflows/build-presets.yml
   workflow_dispatch:
 
 concurrency:

examples/qualcomm/oss_scripts/llama/decoder_utils.py

Lines changed: 109 additions & 53 deletions
@@ -219,37 +219,42 @@ def post_process():
 
 
 def smart_mask_updater(
-    ar_len, atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches
+    _, n_updates, atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches
 ):
-    # Update the KV cache input for the next inference when the position exceeds the autoregressive length.
-    if pos >= ar_len:
+    # ar_len is unused in smart mask
+    max_cache_len = k_caches[0].size(-1)
+    if pos + n_updates <= max_cache_len:
         for i, k_cache in enumerate(k_caches):
-            k_cache[:, :, pos - ar_len] = new_k_caches[i][:, :, 0]
+            k_cache[:, :, pos : pos + n_updates] = new_k_caches[i][:, :, :n_updates]
 
         for i, v_cache in enumerate(v_caches):
-            v_cache[:, pos - ar_len, :] = new_v_caches[i][:, 0, :]
-        atten_mask[:, :, pos - ar_len] = 0
+            v_cache[:, pos : pos + n_updates, :] = new_v_caches[i][:, :n_updates, :]
+        atten_mask[:, :, pos : pos + n_updates] = 0
+        pos += n_updates
 
-    pos += 1
     return (atten_mask, pos, k_caches, v_caches)
 
 
 def shift_pointer_updater(
-    ar_len, atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches
+    ar_len, n_updates, atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches
 ):
-    # Update the KV cache input for the next inference when the position exceeds the autoregressive length.
-    if pos >= ar_len:
+    max_cache_len = k_caches[0].size(-1)
+    if pos + n_updates <= max_cache_len:
         k_caches = [
-            torch.cat([k_cache[:, :, 1:], new_k_caches[i][:, :, :1]], dim=-1)
+            torch.cat(
+                [k_cache[:, :, n_updates:], new_k_caches[i][:, :, :n_updates]], dim=-1
+            )
             for i, k_cache in enumerate(k_caches)
         ]
         v_caches = [
-            torch.cat([v_cache[:, 1:, :], new_v_caches[i][:, :1, :]], dim=1)
+            torch.cat(
+                [v_cache[:, n_updates:, :], new_v_caches[i][:, :n_updates, :]], dim=1
+            )
             for i, v_cache in enumerate(v_caches)
         ]
-        atten_mask[:, :, -pos - 1] = 0
+        atten_mask[:, :, -pos - n_updates - ar_len : -pos - ar_len] = 0
+        pos += n_updates
 
-    pos += 1
     return (atten_mask, pos, k_caches, v_caches)
 
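Both updaters now take an n_updates count so that a whole chunk of freshly computed K/V entries is committed per call rather than a single token, and pos only advances when the chunk still fits in the cache. Below is a minimal standalone sketch of the smart-mask variant with toy shapes (hypothetical sizes, not the repo's module; the layouts [batch, head_dim, cache_len] for K and [batch, cache_len, head_dim] for V follow the indexing used above):

# Standalone sketch (toy shapes, hypothetical values) of the smart-mask chunk update.
import torch

batch, head_dim, cache_len, n_updates, pos = 1, 4, 8, 3, 2
k_cache = torch.zeros(batch, head_dim, cache_len)            # [B, D, L], as indexed above
v_cache = torch.zeros(batch, cache_len, head_dim)            # [B, L, D]
atten_mask = torch.full((batch, n_updates, cache_len), -255.0)
new_k = torch.randn(batch, head_dim, n_updates)
new_v = torch.randn(batch, n_updates, head_dim)

if pos + n_updates <= cache_len:                             # same guard as smart_mask_updater
    k_cache[:, :, pos : pos + n_updates] = new_k             # write the whole chunk in place
    v_cache[:, pos : pos + n_updates, :] = new_v
    atten_mask[:, :, pos : pos + n_updates] = 0               # unmask the newly filled slots
    pos += n_updates

print(pos)  # 5: the write position advances by the chunk size, not by 1

shift_pointer_updater reaches the same state by rolling the caches with torch.cat instead of writing in place.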

@@ -269,70 +274,121 @@ def kv_inference(
     # TODO: change criteria & support batch inputs if necessary
     all_pos = torch.arange(0, max_seq_len, 1, dtype=torch.int32).unsqueeze(0)
 
-    token_list, result_logits = [], []
+    prompt_token_list, total_token_list, result_logits = [], [], []
 
     if isinstance(prompt, str):
         # Llama2 tokenizer has no special tokens
         if isinstance(tokenizer, (SentencePieceTokenizer, HuggingFaceTokenizer)):
-            token_list = tokenizer.encode(prompt, bos=True, eos=False)
+            prompt_token_list = tokenizer.encode(prompt, bos=True, eos=False)
         elif isinstance(tokenizer, TiktokenTokenizer):
-            token_list = tokenizer.encode(
+            prompt_token_list = tokenizer.encode(
                 prompt, bos=True, eos=False, allowed_special="all"
             )
         else:
             raise RuntimeError("Unknown tokenizer")
     else:
         # pyre-ignore
-        token_list = prompt.flatten().tolist()
-    pos = len(token_list) if len(token_list) < ar_len else ar_len
+        prompt_token_list = prompt.flatten().tolist()
+    total_token_list = prompt_token_list
     dtype = torch.int64 if use_i64_token else torch.int32
 
     with torch.no_grad():
-        while token_list[-1] != tokenizer.eos_id and pos < max_seq_len:
-            tmp_token_list = torch.tensor(
-                token_list[pos - ar_len : pos], dtype=dtype
-            ).reshape(1, -1)
-            tmp_pos = all_pos[:, pos - ar_len : pos]
-            tmp_atten_mask = atten_mask
-            if pos < ar_len:
-                tmp_token_list = torch.cat(
-                    [
-                        torch.zeros((1, ar_len - pos), dtype=dtype),
-                        torch.tensor(token_list, dtype=dtype).reshape(1, -1),
-                    ],
-                    dim=1,
-                )
-                tmp_pos = torch.cat(
-                    [
-                        torch.zeros((1, ar_len - pos), dtype=torch.int32),
-                        all_pos[:, :pos],
-                    ],
-                    dim=1,
-                )
-                tmp_atten_mask = torch.cat(
-                    [
-                        torch.ones(1, ar_len, max_seq_len - pos) * -255.0,
-                        atten_mask[:, :, -pos:],
-                    ],
-                    dim=-1,
-                )
+        # Phase 1: Prefill the prompt in ar_len chunks.
+        num_prompt_tokens = len(prompt_token_list)
+        pos = 0  # Tracks how many prompt tokens have been processed.
+        while pos < num_prompt_tokens:
+            chunk_start_idx = pos
+            # Take a chunk of prompt tokens, up to ar_len length.
+            chunk_end_idx = min(num_prompt_tokens, pos + ar_len)
+            actual_chunk_tokens = prompt_token_list[chunk_start_idx:chunk_end_idx]
+            num_tokens_in_chunk = len(actual_chunk_tokens)
+
+            # Prepare tmp_token_list (padded with zeros).
+            tmp_token_list = torch.zeros((1, ar_len), dtype=dtype)
+            tmp_token_list[0, :num_tokens_in_chunk] = torch.tensor(
+                actual_chunk_tokens, dtype=dtype
+            )
 
+            # Prepare tmp_pos (padded with zeros).
+            tmp_pos = torch.zeros((1, ar_len), dtype=torch.int32)
+            tmp_pos[0, :num_tokens_in_chunk] = all_pos[
+                0,
+                pos : pos + num_tokens_in_chunk,
+            ]
+
+            # Run inference.
             logits, new_k_caches, new_v_caches = module(
                 tmp_token_list,
-                tmp_atten_mask,
+                atten_mask,
                 tmp_pos,
                 *k_caches,
                 *v_caches,
             )
             if collect_logits:
-                result_logits.append(logits)
+                result_logits.append(logits[:, :num_tokens_in_chunk])
+
+            # Update the pos, KV cache and attention mask.
             atten_mask, pos, k_caches, v_caches = kv_updater(
-                ar_len, atten_mask, pos, k_caches, v_caches, new_k_caches, new_v_caches
+                ar_len,
+                num_tokens_in_chunk,
+                atten_mask,
+                pos,
+                k_caches,
+                v_caches,
+                new_k_caches,
+                new_v_caches,
+            )
+        # Append the last run logits to the total_token_list.
+        total_token_list.append(
+            torch.argmax(logits[:, num_tokens_in_chunk - 1], dim=-1).item()
+        )
+
+        # Phase 2: Generate tokens until the EOS token is generated or max_seq_len is reached.
+        # When run on wikitext for ppl evaluation, this while-loop is not expected to run.
+        max_cache_len = max_seq_len - ar_len
+        num_tokens = len(total_token_list)
+        while total_token_list[-1] != tokenizer.eos_id and num_tokens < max_seq_len:
+            chunk_start_idx = min(pos, max_cache_len)
+            # Take a chunk of generated tokens, up to ar_len length.
+            chunk_end_idx = num_tokens
+            actual_chunk_tokens = total_token_list[chunk_start_idx:chunk_end_idx]
+            num_tokens_in_chunk = len(actual_chunk_tokens)
+
+            # Prepare tmp_token_list (padded with zeros).
+            tmp_token_list = torch.zeros((1, ar_len), dtype=dtype)
+            tmp_token_list[0, :num_tokens_in_chunk] = torch.tensor(
+                actual_chunk_tokens, dtype=dtype
+            )
+
+            # Prepare tmp_pos (padded with zeros).
+            tmp_pos = torch.zeros((1, ar_len), dtype=torch.int32)
+            tmp_pos[0, :num_tokens_in_chunk] = all_pos[0, chunk_start_idx:chunk_end_idx]
+
+            logits, new_k_caches, new_v_caches = module(
+                tmp_token_list,
+                atten_mask,
+                tmp_pos,
+                *k_caches,
+                *v_caches,
             )
-            if pos > len(token_list):
-                token_list.append(torch.argmax(logits[:, -1], dim=-1).item())
+            if collect_logits:
+                result_logits.append(logits[:, :num_tokens_in_chunk])
 
-        logging.info(f"kv inference result:\n{tokenizer.decode(token_list)}")
+            atten_mask, pos, k_caches, v_caches = kv_updater(
+                ar_len,
+                1,
+                atten_mask,
+                pos,
+                k_caches,
+                v_caches,
+                new_k_caches,
+                new_v_caches,
+            )
+            total_token_list.append(
+                torch.argmax(logits[:, num_tokens_in_chunk - 1], dim=-1).item()
+            )
+            num_tokens = len(total_token_list)
+        logging.info(f"kv inference result:\n{tokenizer.decode(total_token_list)}")
     if collect_logits:
         result_logits = torch.cat(result_logits, dim=1)
         return result_logits
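The rewritten kv_inference runs in two phases: prefill consumes the prompt in ar_len-sized windows, and decode then generates one token per step (kv_updater is called with 1). A standalone sketch of just the chunking and zero padding used for tmp_token_list and tmp_pos (toy token values, no tokenizer or model call):

# Standalone sketch of the Phase-1 chunking: split a prompt into ar_len windows,
# zero-padding the last partial window for both tokens and positions.
import torch

ar_len = 4
prompt_token_list = [11, 12, 13, 14, 15, 16, 17]   # 7 tokens -> chunks of 4 and 3
pos = 0
while pos < len(prompt_token_list):
    chunk = prompt_token_list[pos : pos + ar_len]
    n = len(chunk)

    tmp_token_list = torch.zeros((1, ar_len), dtype=torch.int32)
    tmp_token_list[0, :n] = torch.tensor(chunk, dtype=torch.int32)

    tmp_pos = torch.zeros((1, ar_len), dtype=torch.int32)
    tmp_pos[0, :n] = torch.arange(pos, pos + n, dtype=torch.int32)

    print(tmp_token_list.tolist(), tmp_pos.tolist())
    pos += n   # in the real loop, kv_updater advances pos by the chunk size
# [[11, 12, 13, 14]] [[0, 1, 2, 3]]
# [[15, 16, 17, 0]]  [[4, 5, 6, 0]]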
Lines changed: 80 additions & 0 deletions
@@ -0,0 +1,80 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <executorch/kernels/portable/cpu/scalar_utils.h>
10+
#include <executorch/kernels/portable/cpu/util/copy_ops_util.h>
11+
#include <executorch/runtime/kernel/kernel_includes.h>
12+
13+
namespace torch {
14+
namespace executor {
15+
namespace native {
16+
17+
using Tensor = executorch::aten::Tensor;
18+
19+
template <typename T>
20+
using OptionalArrayRef = executorch::aten::OptionalArrayRef<T>;
21+
22+
/**
23+
* _clone_dim_order.out(Tensor self, *, bool non_blocking=False, int[]?
24+
* dim_order=None, Tensor(a!) out) -> Tensor(a!)
25+
*
26+
* Clones via element-wise copy while preserving dim_order.
27+
*/
28+
Tensor& _clone_dim_order_out(
29+
KernelRuntimeContext& ctx,
30+
const Tensor& self,
31+
bool non_blocking,
32+
OptionalArrayRef<int64_t> dim_order,
33+
Tensor& out) {
34+
(void)ctx;
35+
36+
// Ensure input and output dtype match.
37+
ET_KERNEL_CHECK(
38+
ctx, self.scalar_type() == out.scalar_type(), InvalidArgument, out);
39+
40+
// Ensure output has the same layout as input or matches dim_order.
41+
ET_KERNEL_CHECK(
42+
ctx,
43+
check__to_dim_order_copy_args(self, non_blocking, dim_order, out),
44+
InvalidArgument,
45+
out);
46+
47+
// Ensure input and output shapes match, resizing if necessary.
48+
ET_KERNEL_CHECK(
49+
ctx,
50+
resize_tensor(out, self.sizes()) == torch::executor::Error::Ok,
51+
InvalidArgument,
52+
out);
53+
54+
if (self.numel() == 0) {
55+
return out;
56+
}
57+
58+
// Select the correct input dtype and copy the tensors.
59+
ET_SWITCH_REALHBBF16_TYPES(
60+
self.scalar_type(),
61+
ctx,
62+
"dim_order_ops::_clone_dim_order.out",
63+
CTYPE,
64+
[&] { _to_dim_order_copy_impl<CTYPE, CTYPE>(self, out); });
65+
66+
return out;
67+
}
68+
69+
Tensor& _clone_dim_order_out(
70+
const Tensor& self,
71+
bool non_blocking,
72+
OptionalArrayRef<int64_t> dim_order,
73+
Tensor& out) {
74+
executorch::ET_RUNTIME_NAMESPACE::KernelRuntimeContext context{};
75+
return _clone_dim_order_out(context, self, non_blocking, dim_order, out);
76+
}
77+
78+
} // namespace native
79+
} // namespace executor
80+
} // namespace torch
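For intuition, the following is the eager-PyTorch analogue of what this new _clone_dim_order kernel provides: a clone whose output keeps the input's memory layout (dim_order) and copies values element-wise. The sketch only illustrates the semantics with torch.clone; it does not exercise the ExecuTorch kernel itself:

# Eager-mode analogue (not this kernel's code path) of "clone preserving dim_order".
import torch

x = torch.randn(2, 3, 4, 5).to(memory_format=torch.channels_last)
y = x.clone(memory_format=torch.preserve_format)

print(x.stride() == y.stride())                            # True: layout preserved
print(y.is_contiguous(memory_format=torch.channels_last))  # True
print(torch.equal(x, y))                                   # True: element-wise copy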

kernels/portable/cpu/op__to_dim_order_copy.cpp

Lines changed: 0 additions & 23 deletions
@@ -29,29 +29,6 @@ using OptionalArrayRef = executorch::aten::OptionalArrayRef<T>;
 template <typename T>
 using Optional = std::optional<T>;
 
-namespace {
-
-template <typename SELF_CTYPE, typename OUT_CTYPE>
-void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) {
-  auto self_data = self.mutable_data_ptr<SELF_CTYPE>();
-  auto out_data = out.mutable_data_ptr<OUT_CTYPE>();
-
-  // Here we make a slightly off-label use of
-  // BroadcastIndexesRange. It always assumes it doesn't have to care
-  // about different dim_order between input and output, but we can
-  // just force it to respect strides (and thus dim_order) for its
-  // inputs using support_noncontiguous_input_tensors=true, and then pretend
-  // the output is just another input.
-  for (const auto [unused_index, self_data_index, out_data_index] :
-       BroadcastIndexesRange<2, /*support_noncontiguous_input_tensors=*/true>(
-           /*dummy output*/ self, self, out)) {
-    (void)unused_index;
-    out_data[out_data_index] =
-        static_cast<OUT_CTYPE>(self_data[self_data_index]);
-  }
-}
-} // namespace
-
 // _to_dim_order_copy.out(Tensor self, *, bool non_blocking=False, int[]?
 // dim_order=None, Tensor(a!) out) -> Tensor(a!)
 Tensor& _to_dim_order_copy_out(

kernels/portable/cpu/util/copy_ops_util.h

Lines changed: 24 additions & 0 deletions
@@ -9,6 +9,7 @@
 #pragma once
 #include <c10/util/irange.h>
 
+#include <executorch/kernels/portable/cpu/util/broadcast_util.h>
 #include <executorch/runtime/kernel/kernel_includes.h>
 
 namespace torch {
@@ -77,6 +78,29 @@ void as_strided_copy(
   }
 }
 
+/**
+ * Copies and casts a tensor while preserving input dim_order.
+ */
+template <typename SELF_CTYPE, typename OUT_CTYPE>
+void _to_dim_order_copy_impl(const Tensor& self, Tensor& out) {
+  auto self_data = self.mutable_data_ptr<SELF_CTYPE>();
+  auto out_data = out.mutable_data_ptr<OUT_CTYPE>();
+
+  // Here we make a slightly off-label use of
+  // BroadcastIndexesRange. It always assumes it doesn't have to care
+  // about different dim_order between input and output, but we can
+  // just force it to respect strides (and thus dim_order) for its
+  // inputs using support_noncontiguous_input_tensors=true, and then pretend
+  // the output is just another input.
+  for (const auto [unused_index, self_data_index, out_data_index] :
+       BroadcastIndexesRange<2, /*support_noncontiguous_input_tensors=*/true>(
+           /*dummy output*/ self, self, out)) {
+    (void)unused_index;
+    out_data[out_data_index] =
+        static_cast<OUT_CTYPE>(self_data[self_data_index]);
+  }
+}
+
 bool check_cat_args(
     executorch::aten::ArrayRef<Tensor> tensors,
     int64_t dim,
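The relocated _to_dim_order_copy_impl pairs up elements of two tensors that may be stored with different dim_orders by mapping each logical index through each tensor's own strides. A plain-Python sketch of that stride arithmetic on a toy 2x3 tensor (row-major source, column-major destination; this is only the underlying idea, not the BroadcastIndexesRange machinery itself):

# Standalone sketch: stride-respecting element-wise copy between two layouts.
import itertools

shape = (2, 3)
src_strides = (3, 1)             # row-major buffer
dst_strides = (1, 2)             # column-major buffer
src = [10, 11, 12, 20, 21, 22]   # logical [[10, 11, 12], [20, 21, 22]], row-major
dst = [0] * 6

for idx in itertools.product(*(range(s) for s in shape)):
    src_off = sum(i * s for i, s in zip(idx, src_strides))   # flat offset in src layout
    dst_off = sum(i * s for i, s in zip(idx, dst_strides))   # flat offset in dst layout
    dst[dst_off] = src[src_off]

print(dst)  # [10, 20, 11, 21, 12, 22]: same logical tensor, column-major storage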

kernels/portable/cpu/util/targets.bzl

Lines changed: 3 additions & 1 deletion
@@ -147,6 +147,9 @@ def define_common_targets():
             "copy_ops_util.h",
         ],
         compiler_flags = ["-Wno-missing-prototypes"],
+        exported_deps = [
+            ":broadcast_util",
+        ],
         deps = [
             "//executorch/runtime/kernel:kernel_includes",
         ],
@@ -348,7 +351,6 @@ def define_common_targets():
         ],
     )
 
-
     runtime.cxx_library(
         name = "arange_util{}".format(suffix),
        srcs = ["arange_util.cpp"],
