From 56f5f3e4d2c6f38fed37bb21559a5db60b137de8 Mon Sep 17 00:00:00 2001
From: Yuhan Guo
Date: Thu, 13 Mar 2025 16:13:25 -0700
Subject: [PATCH] Move tokenizer info into pte for et

---
 runner/run.cpp                          | 24 +++++++++++++++++++-
 torchchat/export.py                     | 36 ++++++++++++++++---------
 torchchat/utils/scripts/build_native.sh |  2 +-
 3 files changed, 48 insertions(+), 14 deletions(-)

diff --git a/runner/run.cpp b/runner/run.cpp
index d64c636bb..d7239764a 100644
--- a/runner/run.cpp
+++ b/runner/run.cpp
@@ -53,6 +53,9 @@ using executorch::extension::TensorPtr;
 using torch::executor::EValue;
 using torch::executor::Module;
 using torch::executor::Result;
+using executorch::runtime::MemoryManager;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::Error;
 #endif

 using tokenizers::SPTokenizer;
@@ -867,7 +870,26 @@ int main(int argc, char *argv[]) {
                       : torch::Device(torch::kCUDA);
   ModelType model_type = get_model_type(std::stoi(aoti_metadata["tokenizer_type"]));
 #else // __ET_MODEL__
-  ModelType model_type = get_model_type(llama_ver);
+  Error load_status = transformer.runner->load();
+  ET_CHECK_MSG(
+      load_status == torch::executor::Error::Ok,
+      "program::load() failed with status 0x%" PRIx32,
+      static_cast<uint32_t>(load_status));
+
+  static std::array<uint8_t, 4 * 1024U * 1024U> method_allocator_pool; // 4MB
+  MemoryAllocator method_allocator{MemoryAllocator(
+      sizeof(method_allocator_pool), method_allocator_pool.data())};
+  MemoryManager memory_manager(&method_allocator, nullptr);
+  auto tokenizer_method = transformer.runner->program()->load_method("tokenizer_type", &memory_manager);
+
+  Error execute_status = tokenizer_method->execute();
+  ET_CHECK_MSG(
+      execute_status == torch::executor::Error::Ok,
+      "method::execute() failed with status 0x%" PRIx32,
+      static_cast<uint32_t>(execute_status));
+
+  auto tokenizer_type = tokenizer_method->get_output(0).toInt();
+  ModelType model_type = get_model_type(tokenizer_type);
 #endif

   if (model_type == UNKNOWN_MODEL) {
diff --git a/torchchat/export.py b/torchchat/export.py
index bad97cd35..a1dca61b2 100644
--- a/torchchat/export.py
+++ b/torchchat/export.py
@@ -313,7 +313,7 @@ def export_to_edge(
            core_aten_ep, edge_constant_methods, edge_compile_config, verbose=verbose
        )

-    def export_for_et(model, device, output_path) -> str:
+    def export_for_et(model, device, output_path, edge_constant_methods) -> str:

        input = (
            torch.tensor([[1]], dtype=torch.long, device=device),
@@ -344,12 +344,15 @@ def export_for_et(model, device, output_path) -> str:
        with torch.nn.attention.sdpa_kernel(
            [torch.nn.attention.SDPBackend.MATH]
        ), torch.no_grad():
-            m = export_for_training(model, input, dynamic_shapes=dynamic_shapes).module()
+            m = export_for_training(
+                model, input, dynamic_shapes=dynamic_shapes
+            ).module()

            edge_manager = export_to_edge(
                m,
                input,
                dynamic_shapes=dynamic_shapes,
+                edge_constant_methods=edge_constant_methods,
                edge_compile_config=edge_config,
            )
        edge_manager = edge_manager.to_backend(XnnpackDynamicallyQuantizedPartitioner())
@@ -365,6 +368,7 @@ def export_for_et(model, device, output_path) -> str:
        )

        print("The methods are: ", export_program.methods)
+        print("The config methods are: ", export_program.config_methods)

        with open(output_path, "wb") as f:
            export_program.write_to_file(f)
@@ -407,7 +411,9 @@ def main(args):
            f"Warning! ExecuTorch export target is controlled by export recipe, not device setting. Ignoring device={builder_args.device} setting."
        )
        builder_args.device = "cpu"
-    elif (output_pte_path or output_dso_path or output_aoti_package_path) and "mps" in builder_args.device:
+    elif (
+        output_pte_path or output_dso_path or output_aoti_package_path
+    ) and "mps" in builder_args.device:
        print("Warning! Device MPS not supported for export. Exporting for device CPU.")
        builder_args.device = "cpu"

@@ -473,13 +479,26 @@ def main(args):
            support_tensor_subclass=False,
        )
        _unset_gguf_kwargs(builder_args)
-
+
+    if tokenizer_args is None:
+        tokenizer_type = "0"
+    elif tokenizer_args.is_sentencepiece:
+        tokenizer_type = "2" # Corresponding to llama2
+    else:
+        tokenizer_type = "3" # Corresponding to llama3
+
    with torch.no_grad():
        if output_pte_path:
            output_pte_path = str(os.path.abspath(output_pte_path))
            if executorch_export_available:
                print(f"Exporting model using ExecuTorch to {output_pte_path}")
-                export_for_et(model_to_pte, builder_args.device, args.output_pte_path)
+                print(f"Tokenizer type is {tokenizer_type}")
+                export_for_et(
+                    model_to_pte,
+                    builder_args.device,
+                    args.output_pte_path,
+                    {"tokenizer_type": int(tokenizer_type)},
+                )
            else:
                print(
                    "Export with executorch requested but ExecuTorch could not be loaded"
                )
@@ -503,13 +522,6 @@ def main(args):
    if output_aoti_package_path:
        output_aoti_package_path = str(os.path.abspath(output_aoti_package_path))

-        if tokenizer_args is None:
-            tokenizer_type = "0"
-        elif tokenizer_args.is_sentencepiece:
-            tokenizer_type = "2" # Corresponding to llama2
-        else:
-            tokenizer_type = "3" # Corresponding to llama3
-
        metadata = {"tokenizer_type": tokenizer_type}
        print(
            "Exporting model using AOT Inductor to " f"{output_aoti_package_path}."
        )
diff --git a/torchchat/utils/scripts/build_native.sh b/torchchat/utils/scripts/build_native.sh
index d0e141678..0ba2f6858 100755
--- a/torchchat/utils/scripts/build_native.sh
+++ b/torchchat/utils/scripts/build_native.sh
@@ -100,4 +100,4 @@ else
 fi

 cmake --build ./cmake-out --target "${TARGET}"_run
-printf "Build finished. Please run: \n./cmake-out/${TARGET}_run model.<pte|so> -z tokenizer.model -l <llama version (2 or 3)> -i <prompt>\n"
+printf "Build finished. Please run: \n./cmake-out/${TARGET}_run model.<pte|so> -z tokenizer.model -i <prompt>\n"
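
Note (not part of the patch): the export-side change relies on ExecuTorch's ability to bake "constant methods" into the .pte alongside forward, which is what lets the runner recover the tokenizer type without the old -l flag. Below is a minimal sketch of that mechanism, assuming the executorch.exir to_edge/to_executorch API; TinyModel and tiny_model.pte are illustrative names only, not torchchat code.

    # Minimal sketch: embed a scalar (e.g. tokenizer_type) in a .pte as a constant
    # method. Assumes executorch is installed; names below are hypothetical.
    import torch
    from executorch.exir import to_edge

    class TinyModel(torch.nn.Module):
        def forward(self, x):
            return x + 1

    exported = torch.export.export(TinyModel().eval(), (torch.ones(1),))

    # constant_methods become argument-free methods of the final program; a C++
    # runner can load_method("tokenizer_type"), execute it, and read output 0,
    # which is what the runner/run.cpp change above does.
    edge = to_edge(exported, constant_methods={"tokenizer_type": 3})
    et_program = edge.to_executorch()

    with open("tiny_model.pte", "wb") as f:
        f.write(et_program.buffer)

In the patch itself the same idea flows through export_to_edge(..., edge_constant_methods=...) and shows up in export_program.config_methods, so the tokenizer type travels inside the .pte instead of being passed on the runner command line.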