Support cache positions

larryliu0820 · larryliu0820 · commit 47b121a5cada · 2025-07-11T10:39:50.000-07:00
diff --git a/examples/models/phi-3-mini/README.md b/examples/models/phi-3-mini/README.md
@@ -22,18 +22,7 @@ python -m examples.models.phi-3-mini.export_phi-3-mini -c "4k" -s 128 -o phi-3-m
 3. Build and run the model.
 - Build executorch with optimized CPU performance as follows. Build options available [here](https://github.com/pytorch/executorch/blob/main/CMakeLists.txt#L59).
  ```
- cmake -DPYTHON_EXECUTABLE=python \
-     -DCMAKE_INSTALL_PREFIX=cmake-out \
-     -DEXECUTORCH_ENABLE_LOGGING=1 \
-     -DCMAKE_BUILD_TYPE=Release \
-     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-     -DEXECUTORCH_BUILD_XNNPACK=ON \
-     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-     -Bcmake-out .
+ cmake --preset llm -DCMAKE_INSTALL_PREFIX=cmake-out
 
  cmake --build cmake-out -j16 --target install --config Release
  ```
diff --git a/examples/models/phi-3-mini/export_phi-3-mini.py b/examples/models/phi-3-mini/export_phi-3-mini.py
@@ -51,11 +51,15 @@ def export(args) -> None:
             torch.tensor(
                 [[1048, 263, 931, 746]], dtype=torch.long, requires_grad=False
             ),
+            torch.tensor([[0, 1, 2, 3]], dtype=torch.long, requires_grad=False),
         )
         dynamic_shapes = {
             "input_ids": {
                 1: torch.export.Dim("sequence_length", min=1, max=args.seq_len)
-            }
+            },
+            "cache_positions": {
+                1: torch.export.Dim("sequence_length", min=1, max=args.seq_len)
+            },
         }
 
         xnnpack_quant_config = get_symmetric_quantization_config(
diff --git a/extension/llm/runner/text_llm_runner.cpp b/extension/llm/runner/text_llm_runner.cpp
@@ -32,6 +32,7 @@ static constexpr auto kMaxContextLen = "get_max_context_len";
 static constexpr auto kVocabSize = "get_vocab_size";
 static constexpr auto kUseKVCache = "use_kv_cache";
 static constexpr auto kUseSDPAWithKVCache = "use_sdpa_with_kv_cache";
+static constexpr auto kUseCachePositions = "use_cache_positions";
 
 TextLLMRunner::TextLLMRunner(
     std::unordered_map<std::string, int64_t> metadata,
@@ -306,6 +307,7 @@ std::unordered_map<std::string, int64_t> get_llm_metadata(
       {llm::kMaxContextLen, 128},
       {llm::kUseKVCache, true},
       {llm::kUseSDPAWithKVCache, false},
+      {llm::kUseCachePositions, false},
   });
 
   // Read metadata from the model
@@ -335,7 +337,24 @@ std::unordered_map<std::string, int64_t> get_llm_metadata(
   // Set tokenizer-related metadata
   metadata[llm::kBosId] = tokenizer->bos_tok();
   metadata[llm::kVocabSize] = tokenizer->vocab_size();
-  return metadata;
+
+  // Override metadata using the module's method_meta
+  auto method_meta_result = module->method_meta("forward");
+  if (method_meta_result.error() != Error::Ok) {
+    ET_LOG(Error, "Failed reading method meta");
+    return metadata;
+  }
+  auto method_meta = method_meta_result.get();
+  // With at most 1 input, we are not using kv cache
+  metadata[llm::kUseKVCache] = method_meta.num_inputs() > 1;
+
+  if (method_meta.num_inputs() > 1) {
+    // Check if we are using cache positions instead of input pos.
+    // For input_pos, size is [1], for cache_positions, size is [1, max_seq_len]
+    auto second_input_info = method_meta.input_tensor_meta(1).get();
+    metadata[llm::kUseCachePositions] = second_input_info.sizes().size() == 2;
+  }
+  return metadata;
 }
 
 std::unordered_set<uint64_t> get_eos_ids(
diff --git a/extension/llm/runner/text_prefiller.cpp b/extension/llm/runner/text_prefiller.cpp
@@ -19,10 +19,12 @@ namespace llm {
 TextPrefiller::TextPrefiller(
     TextDecoderRunner* text_decoder_runner,
     bool use_kv_cache,
+    bool use_cache_positions,
     bool enable_parallel_prefill,
     int64_t max_seq_len)
     : text_decoder_runner_(text_decoder_runner),
       use_kv_cache_(use_kv_cache),
+      use_cache_positions_(use_cache_positions),
       enable_parallel_prefill_(enable_parallel_prefill),
       max_seq_len_(max_seq_len > 0 ? max_seq_len : 128) {}
 
diff --git a/extension/llm/runner/text_prefiller.h b/extension/llm/runner/text_prefiller.h
@@ -21,7 +21,8 @@ class ET_EXPERIMENTAL TextPrefiller {
  public:
   TextPrefiller(
       TextDecoderRunner* text_decoder_runner,
-      bool use_kv_cache_,
+      bool use_kv_cache,
+      bool use_cache_positions,
       bool enable_parallel_prefill,
       int64_t max_seq_len = 128);
 
@@ -75,6 +76,7 @@ class ET_EXPERIMENTAL TextPrefiller {
    */
   TextDecoderRunner* text_decoder_runner_;
   bool use_kv_cache_;
+  bool use_cache_positions_;
   bool enable_parallel_prefill_;
   int64_t max_seq_len_;
 };
diff --git a/extension/tensor/tensor_ptr_maker.cpp b/extension/tensor/tensor_ptr_maker.cpp
@@ -186,5 +186,59 @@ TensorPtr randint_strided(
       std::uniform_int_distribution<int64_t>(low, high - 1));
 }
 
+// Builds a 1-D tensor containing start, start + step, start + 2 * step, ...
+// stopping before `end`. All arithmetic is done in double and each element is
+// cast to the requested `type`, so integral inputs beyond 2^53 may lose
+// precision.
+TensorPtr arange(
+    executorch::aten::Scalar start,
+    executorch::aten::Scalar end,
+    executorch::aten::Scalar step,
+    executorch::aten::ScalarType type,
+    executorch::aten::TensorShapeDynamism dynamism) {
+  // Widen a numeric Scalar to double so that integral and floating-point
+  // arguments share one element-count computation. Anything that is neither
+  // floating point nor a non-bool integral trips ET_CHECK_MSG below.
+  const auto to_double = [](const executorch::aten::Scalar& value,
+                            const char* name) -> double {
+    if (value.isFloatingPoint()) {
+      return value.to<double>();
+    }
+    if (value.isIntegral(/*includeBool=*/false)) {
+      return static_cast<double>(value.to<int64_t>());
+    }
+    ET_CHECK_MSG(false, "%s must be a number", name);
+    return 0.0; // Not reached when the check above is fatal.
+  };
+
+  // Convert every argument up front; a non-numeric one fails right here.
+  const double start_val = to_double(start, "start");
+  const double end_val = to_double(end, "end");
+  const double step_val = to_double(step, "step");
+
+  ET_CHECK_MSG(step_val != 0, "step cannot be zero");
+
+  // Calculate the number of elements: ceil((end - start) / step). A step that
+  // points away from `end` gives a negative quotient, which is clamped to 0.
+  int64_t numel =
+      static_cast<int64_t>(std::ceil((end_val - start_val) / step_val));
+  numel = std::max(int64_t(0), numel); // Ensure non-negative
+
+  // Create a 1D tensor with the calculated size (sizes = {numel},
+  // strides = {1}).
+  auto tensor = empty_strided({numel}, {1}, type, dynamism);
+
+  // Fill the tensor with values from start to end with step. Each value is
+  // computed from the index rather than accumulated step by step.
+  ET_SWITCH_REALHBBF16_TYPES(type, nullptr, "arange", CTYPE, [&] {
+    CTYPE* data = tensor->mutable_data_ptr<CTYPE>();
+    for (int64_t i = 0; i < numel; ++i) {
+      data[i] = static_cast<CTYPE>(start_val + i * step_val);
+    }
+  });
+
+  return tensor;
+}
+
 } // namespace extension
 } // namespace executorch
diff --git a/extension/tensor/tensor_ptr_maker.h b/extension/tensor/tensor_ptr_maker.h
@@ -683,5 +683,41 @@ inline TensorPtr randint(
   return randint_strided(low, high, std::move(sizes), {}, type, dynamism);
 }
 
+/**
+ * Creates a 1-D tensor with values from `start` to `end` (exclusive) with
+ * step size `step`.
+ *
+ * @param start The starting value of the sequence.
+ * @param end The ending value of the sequence (exclusive).
+ * @param step The step size between values in the sequence.
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+TensorPtr arange(
+    executorch::aten::Scalar start,
+    executorch::aten::Scalar end,
+    executorch::aten::Scalar step = 1,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND);
+
+/**
+ * Creates a 1-D tensor with values from 0 to `end` (exclusive) with step
+ * size 1.
+ *
+ * @param end The ending value of the sequence (exclusive).
+ * @param type The scalar type of the tensor elements.
+ * @param dynamism Specifies whether the tensor's shape is static or dynamic.
+ * @return A TensorPtr instance managing the newly created Tensor.
+ */
+inline TensorPtr arange(
+    executorch::aten::Scalar end,
+    executorch::aten::ScalarType type = executorch::aten::ScalarType::Float,
+    executorch::aten::TensorShapeDynamism dynamism =
+        executorch::aten::TensorShapeDynamism::DYNAMIC_BOUND) {
+  return arange(0, end, 1, type, dynamism);
+}
+
 } // namespace extension
 } // namespace executorch
diff --git a/extension/tensor/test/tensor_ptr_maker_test.cpp b/extension/tensor/test/tensor_ptr_maker_test.cpp
@@ -506,3 +506,90 @@ TEST_F(TensorPtrMakerTest, CreateRandnTensorWithIntType) {
     EXPECT_EQ(val, 0);
   }
 }
+
+TEST_F(TensorPtrMakerTest, CreateArangeTensorWithDefaultStartAndStep) {
+  auto tensor = arange(5);
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 5);
+  EXPECT_EQ(tensor->scalar_type(), executorch::aten::ScalarType::Float);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<float>()[i];
+    EXPECT_EQ(val, static_cast<float>(i));
+  }
+}
+
+TEST_F(TensorPtrMakerTest, CreateArangeTensorWithStartEndStep) {
+  auto tensor = arange(2, 10, 2);
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 4); // (10-2)/2 = 4 elements
+  EXPECT_EQ(tensor->scalar_type(), executorch::aten::ScalarType::Float);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<float>()[i];
+    EXPECT_EQ(val, static_cast<float>(2 + i * 2));
+  }
+}
+
+TEST_F(TensorPtrMakerTest, CreateArangeTensorWithNegativeStep) {
+  auto tensor = arange(5, 0, -1);
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 5);
+  EXPECT_EQ(tensor->scalar_type(), executorch::aten::ScalarType::Float);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<float>()[i];
+    EXPECT_EQ(val, static_cast<float>(5 - i));
+  }
+}
+
+TEST_F(TensorPtrMakerTest, CreateArangeTensorWithIntType) {
+  auto tensor = arange(0, 5, 1, executorch::aten::ScalarType::Int);
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 5);
+  EXPECT_EQ(tensor->scalar_type(), executorch::aten::ScalarType::Int);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<int32_t>()[i];
+    EXPECT_EQ(val, i);
+  }
+}
+
+TEST_F(TensorPtrMakerTest, CreateArangeTensorWithLongType) {
+  auto tensor = arange(0, 5, 1, executorch::aten::ScalarType::Long);
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 5);
+  EXPECT_EQ(tensor->scalar_type(), executorch::aten::ScalarType::Long);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<int64_t>()[i];
+    EXPECT_EQ(val, static_cast<int64_t>(i));
+  }
+}
+
+TEST_F(TensorPtrMakerTest, CreateArangeTensorWithDoubleType) {
+  auto tensor = arange(0.5, 5.5, 0.5, executorch::aten::ScalarType::Double);
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 10); // (5.5-0.5)/0.5 = 10 elements
+  EXPECT_EQ(tensor->scalar_type(), executorch::aten::ScalarType::Double);
+
+  for (auto i = 0; i < tensor->numel(); ++i) {
+    auto val = tensor->const_data_ptr<double>()[i];
+    EXPECT_DOUBLE_EQ(val, 0.5 + i * 0.5);
+  }
+}
+
+TEST_F(TensorPtrMakerTest, CreateArangeTensorWithEmptyRange) {
+  // End < start with positive step should give empty tensor
+  auto tensor = arange(5, 0, 1);
+
+  EXPECT_EQ(tensor->dim(), 1);
+  EXPECT_EQ(tensor->size(0), 0);
+  EXPECT_EQ(tensor->scalar_type(), executorch::aten::ScalarType::Float);
+}