Merge branch 'main' into export-D79850580

shoumikhin · web-flow · commit c5d744e0d3b8 · 2025-08-09T17:11:28.000-07:00
diff --git a/examples/models/llama/evaluate/eager_eval.py b/examples/models/llama/evaluate/eager_eval.py
@@ -10,6 +10,7 @@
 import torch
 
 from lm_eval.models.huggingface import HFLM as eval_wrapper
+from pytorch_tokenizers.hf_tokenizer import HuggingFaceTokenizer
 from pytorch_tokenizers.llama2c import Llama2cTokenizer as SentencePieceTokenizer
 from pytorch_tokenizers.tiktoken import TiktokenTokenizer as Tiktoken
 
@@ -24,7 +25,7 @@ class EagerEvalWrapper(eval_wrapper):
     def __init__(
         self,
         model: nn.Module,
-        tokenizer: Union[SentencePieceTokenizer, Tiktoken],
+        tokenizer: Union[SentencePieceTokenizer, Tiktoken, HuggingFaceTokenizer],
         max_seq_length: Optional[int] = None,
         use_kv_cache: bool = False,
     ):
diff --git a/examples/qualcomm/oss_scripts/llama/TARGETS b/examples/qualcomm/oss_scripts/llama/TARGETS
@@ -15,10 +15,30 @@ python_library(
     ],
 )
 
+python_library(
+    name = "decoder_utils",
+    srcs = [
+        "decoder_utils.py",
+    ],
+    deps = [
+        "//caffe2:torch",
+        "//executorch/examples/models/llama:eval_library",
+    ],
+)
+
+python_library(
+    name = "decoder_constants",
+    srcs = [
+        "decoder_constants.py",
+    ],
+)
+
 python_library(
     name = "llama_lib",
     srcs = ["llama.py"],
     deps = [
+        ":decoder_constants",
+        ":decoder_utils",
         "//executorch/examples/models/llama:source_transformation",
         "//caffe2:torch",
         "//executorch/backends/qualcomm/partition:partition",
diff --git a/examples/qualcomm/oss_scripts/llama/decoder_utils.py b/examples/qualcomm/oss_scripts/llama/decoder_utils.py
@@ -44,14 +44,15 @@ def __init__(
         tokenizer: Union[
             SentencePieceTokenizer, TiktokenTokenizer, HuggingFaceTokenizer
         ],
-        max_seq_length: Optional[int],
+        max_seq_length: int,
         ar_len: int,
         use_kv_cache: bool,
         get_example_inputs: Callable,
         kv_updater: Callable,
         use_i64_token: bool,
     ):
         # n seq len = n-1 cache len, so we len(inps) = n-1 during _model_call
+        assert max_seq_length is not None, "max_seq_length must be provided"
         super().__init__(
             model=model, tokenizer=tokenizer, max_seq_length=max_seq_length - 1
         )
@@ -119,8 +120,10 @@ def __init__(
         for method in program.execution_plan:
             # Don't use tokenizer.n_words, the numbers are off once calling get_tokenizer()
             if method.name == "get_vocab_size":
+                # pyre-ignore
                 self.output_vocab_size = method.values[0].val.int_val
             if method.name == "get_max_seq_len":
+                # pyre-ignore
                 pte_max_seq_len = method.values[0].val.int_val
         assert self.output_vocab_size is not None, "Couldn't find the vocab size"
         assert pte_max_seq_len is not None, "Couldn't find the max_seq_len from pte"
@@ -156,6 +159,7 @@ def __init__(
         )
         self.adb.push(inputs=[], input_list="", files=[self.runtime_tokenizer_path])
         # n seq len = n-1 cache len, so we len(inps) = n-1 during _model_call
+        # pyre-ignore
         super().__init__(None, tokenizer, max_seq_length - 1)
 
     def _model_call(self, inps):
@@ -278,6 +282,7 @@ def kv_inference(
         else:
             raise RuntimeError("Unknown tokenizer")
     else:
+        # pyre-ignore
         token_list = prompt.flatten().tolist()
     pos = len(token_list) if len(token_list) < ar_len else ar_len
     dtype = torch.int64 if use_i64_token else torch.int32
@@ -359,6 +364,7 @@ def prefill_inference(
         else:
             raise RuntimeError("Unknown tokenizer")
     else:
+        # pyre-ignore
         token_list = prompt.flatten().tolist()
 
     pos = len(token_list)
@@ -405,7 +411,7 @@ def graph_module_inference(
     max_seq_len=512,
     kv_updater=smart_mask_updater,
     use_i64_token=False,
-    event_name: str = None,
+    event_name: Optional[str] = None,
 ):
     if args.tasks is None:
         if use_kv_cache:
diff --git a/extension/module/module.cpp b/extension/module/module.cpp
@@ -210,7 +210,6 @@ runtime::Error Module::load_method(
         method_holder.memory_manager.get(),
         event_tracer ? event_tracer : this->event_tracer(),
         data_map_.get()));
-    method_holder.inputs.resize(method_holder.method->inputs_size());
     methods_.emplace(method_name, std::move(method_holder));
   }
   return runtime::Error::Ok;
@@ -233,28 +232,10 @@ runtime::Result<std::vector<runtime::EValue>> Module::execute(
     const std::vector<runtime::EValue>& input_values) {
   ET_CHECK_OK_OR_RETURN_ERROR(load_method(method_name));
   auto& method = methods_.at(method_name).method;
-  auto& inputs = methods_.at(method_name).inputs;
-
-  ET_CHECK_OR_RETURN_ERROR(
-      input_values.size() <= inputs.size(),
-      InvalidArgument,
-      "input size: %zu does not match method input size: %zu",
-      input_values.size(),
-      inputs.size());
-  for (size_t i = 0; i < input_values.size(); ++i) {
-    if (!input_values[i].isNone()) {
-      inputs[i] = input_values[i];
-    }
+  for (size_t index = 0; index < input_values.size(); ++index) {
+    ET_CHECK_OK_OR_RETURN_ERROR(method->set_input(input_values[index], index));
   }
-  for (size_t i = 0; i < inputs.size(); ++i) {
-    ET_CHECK_OR_RETURN_ERROR(
-        !inputs[i].isNone(), InvalidArgument, "input %zu is none", i);
-  }
-  ET_CHECK_OK_OR_RETURN_ERROR(
-      method->set_inputs(executorch::aten::ArrayRef<runtime::EValue>(
-          inputs.data(), inputs.size())));
   ET_CHECK_OK_OR_RETURN_ERROR(method->execute());
-
   const auto outputs_size = method->outputs_size();
   std::vector<runtime::EValue> outputs(outputs_size);
   ET_CHECK_OK_OR_RETURN_ERROR(
@@ -268,23 +249,17 @@ runtime::Error Module::set_input(
     const runtime::EValue& input_value,
     size_t input_index) {
   ET_CHECK_OK_OR_RETURN_ERROR(load_method(method_name));
-  methods_.at(method_name).inputs.at(input_index) = input_value;
-  return runtime::Error::Ok;
+  auto& method = methods_.at(method_name).method;
+  return method->set_input(input_value, input_index);
 }
 
 runtime::Error Module::set_inputs(
     const std::string& method_name,
     const std::vector<runtime::EValue>& input_values) {
   ET_CHECK_OK_OR_RETURN_ERROR(load_method(method_name));
-  auto& inputs = methods_.at(method_name).inputs;
-  ET_CHECK_OR_RETURN_ERROR(
-      inputs.size() == input_values.size(),
-      InvalidArgument,
-      "input size: %zu does not match method input size: %zu",
-      input_values.size(),
-      inputs.size());
-  inputs = input_values;
-  return runtime::Error::Ok;
+  auto& method = methods_.at(method_name).method;
+  return method->set_inputs(executorch::aten::ArrayRef<runtime::EValue>(
+      input_values.data(), input_values.size()));
 }
 
 runtime::Error Module::set_output(
diff --git a/extension/module/module.h b/extension/module/module.h
@@ -522,7 +522,6 @@ class Module {
     std::unique_ptr<runtime::HierarchicalAllocator> planned_memory;
     std::unique_ptr<runtime::MemoryManager> memory_manager;
     std::unique_ptr<Method> method;
-    std::vector<runtime::EValue> inputs;
   };
 
   std::string file_path_;
diff --git a/extension/module/test/module_test.cpp b/extension/module/test/module_test.cpp
@@ -267,7 +267,7 @@ TEST_F(ModuleTest, TestForward) {
   EXPECT_TENSOR_CLOSE(result->at(0).toTensor(), *expected.get());
 
   auto tensor2 = make_tensor_ptr({2, 2}, {2.f, 3.f, 4.f, 5.f});
-  const auto result2 = module->forward({tensor2, tensor2});
+  const auto result2 = module->forward({tensor2, tensor2, 1.0});
   EXPECT_EQ(result2.error(), Error::Ok);
 
   const auto expected2 = make_tensor_ptr({2, 2}, {4.f, 6.f, 8.f, 10.f});
diff --git a/extension/pybindings/pybindings.cpp b/extension/pybindings/pybindings.cpp
@@ -358,7 +358,7 @@ class Module final {
 
     MallocMemoryAllocator runtime_allocator_;
 
-    MemoryAllocator temp_allocator_{MemoryAllocator(0, nullptr)};
+    MallocMemoryAllocator temp_allocator_{};
 
     std::vector<std::vector<uint8_t>> non_const_buffers_;
 
@@ -1061,7 +1061,7 @@ class ProgramMemory {
 
   MallocMemoryAllocator runtime_allocator_;
 
-  MemoryAllocator temp_allocator_{MemoryAllocator(0, nullptr)};
+  MallocMemoryAllocator temp_allocator_{};
 
   std::vector<std::vector<uint8_t>> non_const_buffers_;
 
diff --git a/test/end2end/test_temp_allocator_fix.py b/test/end2end/test_temp_allocator_fix.py