From 56f5f3e4d2c6f38fed37bb21559a5db60b137de8 Mon Sep 17 00:00:00 2001
From: Yuhan Guo
Date: Thu, 13 Mar 2025 16:13:25 -0700
Subject: [PATCH] Move tokenizer info into pte for et

---
 runner/run.cpp                          | 24 +++++++++++++++++++-
 torchchat/export.py                     | 36 ++++++++++++++++---------
 torchchat/utils/scripts/build_native.sh |  2 +-
 3 files changed, 48 insertions(+), 14 deletions(-)

diff --git a/runner/run.cpp b/runner/run.cpp
index d64c636bb..d7239764a 100644
--- a/runner/run.cpp
+++ b/runner/run.cpp
@@ -53,6 +53,9 @@ using executorch::extension::TensorPtr;
 using torch::executor::EValue;
 using torch::executor::Module;
 using torch::executor::Result;
+using executorch::runtime::MemoryManager;
+using executorch::runtime::MemoryAllocator;
+using executorch::runtime::Error;
 #endif

 using tokenizers::SPTokenizer;
@@ -867,7 +870,26 @@ int main(int argc, char *argv[]) {
                       : torch::Device(torch::kCUDA);
   ModelType model_type = get_model_type(std::stoi(aoti_metadata["tokenizer_type"]));
 #else // __ET_MODEL__
-  ModelType model_type = get_model_type(llama_ver);
+  Error load_status = transformer.runner->load();
+  ET_CHECK_MSG(
+      load_status == torch::executor::Error::Ok,
+      "program::load() failed with status 0x%" PRIx32,
+      static_cast<uint32_t>(load_status));
+
+  static std::array<uint8_t, 4 * 1024U * 1024U> method_allocator_pool; // 4MB
+  MemoryAllocator method_allocator{MemoryAllocator(
+      sizeof(method_allocator_pool), method_allocator_pool.data())};
+  MemoryManager memory_manager(&method_allocator, nullptr);
+  auto tokenizer_method = transformer.runner->program()->load_method("tokenizer_type", &memory_manager);
+
+  Error execute_status = tokenizer_method->execute();
+  ET_CHECK_MSG(
+      execute_status == torch::executor::Error::Ok,
+      "method::execute() failed with status 0x%" PRIx32,
+      static_cast<uint32_t>(execute_status));
+
+  auto tokenizer_type = tokenizer_method->get_output(0).toInt();
+  ModelType model_type = get_model_type(tokenizer_type);
 #endif

   if (model_type == UNKNOWN_MODEL) {
diff --git a/torchchat/export.py b/torchchat/export.py
index bad97cd35..a1dca61b2 100644
--- a/torchchat/export.py
+++ b/torchchat/export.py
@@ -313,7 +313,7 @@ def export_to_edge(
            core_aten_ep, edge_constant_methods, edge_compile_config, verbose=verbose
        )

-    def export_for_et(model, device, output_path) -> str:
+    def export_for_et(model, device, output_path, edge_constant_methods) -> str:

        input = (
            torch.tensor([[1]], dtype=torch.long, device=device),
@@ -344,12 +344,15 @@ def export_for_et(model, device, output_path) -> str:
        with torch.nn.attention.sdpa_kernel(
            [torch.nn.attention.SDPBackend.MATH]
        ), torch.no_grad():
-            m = export_for_training(model, input, dynamic_shapes=dynamic_shapes).module()
+            m = export_for_training(
+                model, input, dynamic_shapes=dynamic_shapes
+            ).module()

            edge_manager = export_to_edge(
                m,
                input,
                dynamic_shapes=dynamic_shapes,
+                edge_constant_methods=edge_constant_methods,
                edge_compile_config=edge_config,
            )
        edge_manager = edge_manager.to_backend(XnnpackDynamicallyQuantizedPartitioner())
@@ -365,6 +368,7 @@ def export_for_et(model, device, output_path) -> str:
        )

        print("The methods are: ", export_program.methods)
+        print("The config methods are: ", export_program.config_methods)

        with open(output_path, "wb") as f:
            export_program.write_to_file(f)
@@ -407,7 +411,9 @@ def main(args):
            f"Warning! ExecuTorch export target is controlled by export recipe, not device setting. Ignoring device={builder_args.device} setting."
        )
        builder_args.device = "cpu"
-    elif (output_pte_path or output_dso_path or output_aoti_package_path) and "mps" in builder_args.device:
+    elif (
+        output_pte_path or output_dso_path or output_aoti_package_path
+    ) and "mps" in builder_args.device:
        print("Warning! Device MPS not supported for export. Exporting for device CPU.")
        builder_args.device = "cpu"

@@ -473,13 +479,26 @@ def main(args):
            support_tensor_subclass=False,
        )
        _unset_gguf_kwargs(builder_args)
-
+
+    if tokenizer_args is None:
+        tokenizer_type = "0"
+    elif tokenizer_args.is_sentencepiece:
+        tokenizer_type = "2" # Corresponding to llama2
+    else:
+        tokenizer_type = "3" # Corresponding to llama3
+
    with torch.no_grad():
        if output_pte_path:
            output_pte_path = str(os.path.abspath(output_pte_path))
            if executorch_export_available:
                print(f"Exporting model using ExecuTorch to {output_pte_path}")
-                export_for_et(model_to_pte, builder_args.device, args.output_pte_path)
+                print(f"Tokenizer type is {tokenizer_type}")
+                export_for_et(
+                    model_to_pte,
+                    builder_args.device,
+                    args.output_pte_path,
+                    {"tokenizer_type": int(tokenizer_type)},
+                )
            else:
                print(
                    "Export with executorch requested but ExecuTorch could not be loaded"
                )
@@ -503,13 +522,6 @@ def main(args):
    if output_aoti_package_path:
        output_aoti_package_path = str(os.path.abspath(output_aoti_package_path))

-        if tokenizer_args is None:
-            tokenizer_type = "0"
-        elif tokenizer_args.is_sentencepiece:
-            tokenizer_type = "2" # Corresponding to llama2
-        else:
-            tokenizer_type = "3" # Corresponding to llama3
-
        metadata = {"tokenizer_type": tokenizer_type}
        print(
            "Exporting model using AOT Inductor to " f"{output_aoti_package_path}."
        )
diff --git a/torchchat/utils/scripts/build_native.sh b/torchchat/utils/scripts/build_native.sh
index d0e141678..0ba2f6858 100755
--- a/torchchat/utils/scripts/build_native.sh
+++ b/torchchat/utils/scripts/build_native.sh
@@ -100,4 +100,4 @@ else
 fi

 cmake --build ./cmake-out --target "${TARGET}"_run
-printf "Build finished. Please run: \n./cmake-out/${TARGET}_run model.<pte|so> -z tokenizer.model -l <llama version (2 or 3)> -i <prompt>\n"
+printf "Build finished. Please run: \n./cmake-out/${TARGET}_run model.<pte|so> -z tokenizer.model -i <prompt>\n"
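
Note (not part of the patch): the export-side change relies on ExecuTorch's ability to bake "constant methods" into the .pte alongside forward, which is what lets the runner recover the tokenizer type without the old -l flag. Below is a minimal sketch of that mechanism, assuming the executorch.exir to_edge/to_executorch API; TinyModel and tiny_model.pte are illustrative names only, not torchchat code.

    # Minimal sketch: embed a scalar (e.g. tokenizer_type) in a .pte as a constant
    # method. Assumes executorch is installed; names below are hypothetical.
    import torch
    from executorch.exir import to_edge

    class TinyModel(torch.nn.Module):
        def forward(self, x):
            return x + 1

    exported = torch.export.export(TinyModel().eval(), (torch.ones(1),))

    # constant_methods become argument-free methods of the final program; a C++
    # runner can load_method("tokenizer_type"), execute it, and read output 0,
    # which is what the runner/run.cpp change above does.
    edge = to_edge(exported, constant_methods={"tokenizer_type": 3})
    et_program = edge.to_executorch()

    with open("tiny_model.pte", "wb") as f:
        f.write(et_program.buffer)

In the patch itself the same idea flows through export_to_edge(..., edge_constant_methods=...) and shows up in export_program.config_methods, so the tokenizer type travels inside the .pte instead of being passed on the runner command line.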