[aoti] Remove need for the -l flag in the cmake-built runner

angelayi · angelayi · commit 247fd20458f2 · 2024-11-05T16:33:05.000-08:00
diff --git a/README.md b/README.md
@@ -332,7 +332,7 @@ torchchat/utils/scripts/build_native.sh aoti
 
 Then run the compiled executable, with the pt2.
 ```bash
-cmake-out/aoti_run exportedModels/llama3_1_artifacts.pt2 -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time"
+cmake-out/aoti_run exportedModels/llama3_1_artifacts.pt2 -z `python3 torchchat.py where llama3.1`/tokenizer.model -i "Once upon a time"
 ```
 
 ## Mobile Execution
diff --git a/runner/run.cpp b/runner/run.cpp
@@ -32,8 +32,6 @@ LICENSE file in the root directory of this source tree.
 
 #ifdef __AOTI_MODEL__
 #include <torch/csrc/inductor/aoti_package/model_package_loader.h>
-torch::Device aoti_device(torch::kCPU);
-
 #else // __ET_MODEL__
 #include <executorch/extension/module/module.h>
 #include <executorch/extension/tensor/tensor_ptr.h>
@@ -89,9 +87,11 @@ typedef struct {
 typedef struct {
   Config config; // the hyperparameters of the architecture (the blueprint)
   RunState state; // buffers for the "wave" of activations in the forward pass
+  std::unordered_map<std::string, std::string> metadata;
 
 #ifdef __AOTI_MODEL__
   torch::inductor::AOTIModelPackageLoader* runner;
+  
 #else // __ET_MODEL__
   Module* runner;
 #endif
@@ -130,19 +130,9 @@ void read_checkpoint(char* checkpoint, Config* config) {
 
 void build_transformer(
     Transformer* t,
-    char* model_path,
-    int vocab_size,
-    int seq_len) {
-  // read in the Config and the Weights from the model
-  // read_checkpoint(model_path, &t->config);
-  // allocate the RunState buffers
-  t->config.vocab_size = vocab_size;
-  t->config.seq_len = seq_len;
-  malloc_run_state(&t->state, &t->config);
-
+    char* model_path) {
 #ifdef __AOTI_MODEL__
   t->runner = new torch::inductor::AOTIModelPackageLoader(model_path);
-  aoti_device = t->runner->get_metadata()["AOTI_DEVICE_KEY"] == "cpu" ? torch::Device(torch::kCPU) : torch::Device(torch::kCUDA);
 #else //__ET_MODEL__
   t->runner = new Module(
       /* path to PTE model */ model_path,
@@ -194,6 +184,9 @@ float* forward(Transformer* transformer, int token, int pos) {
   torch::Tensor token_tensor =
       torch::from_blob(token_buffer, {1, 1}, torch::kLong);
   torch::Tensor pos_tensor = torch::from_blob(pos_buffer, {1}, torch::kLong);
+  torch::Device aoti_device = transformer->runner->get_metadata()["AOTI_DEVICE_KEY"] == "cpu"
+    ? torch::Device(torch::kCPU)
+    : torch::Device(torch::kCUDA);
   std::vector<torch::Tensor> inputs{
       token_tensor.to(aoti_device), pos_tensor.to(aoti_device)};
 
@@ -895,26 +888,25 @@ int main(int argc, char* argv[]) {
       system_prompt = argv[i + 1];
     } else if (argv[i][1] == 'l') {
       llama_ver = atoi(argv[i + 1]);
-#ifdef __AOTI_MODEL__
-    } else if (argv[i][1] == 'd') {
-#ifdef USE_CUDA
-      if (strcasecmp(argv[i + 1], "CUDA") == 0) {
-        aoti_device = torch::Device(torch::kCUDA);
-      } else
-#endif
-          if (strcasecmp(argv[i + 1], "CPU") == 0) {
-        aoti_device = torch::Device(torch::kCPU);
-      } else {
-        fprintf(stderr, "Unknown device %s", argv[i + 1]);
-        exit(1);
-      }
-#endif
     } else {
       error_usage();
     }
   }
 
+  if (model_path == NULL) {
+    fprintf(stderr, "No model_path provided.");
+    error_usage();
+  }
+  
+  Transformer transformer;
+  build_transformer(&transformer, model_path);
+
+#ifdef __AOTI_MODEL__
+  ModelType model_type = get_model_type(std::stoi(transformer.runner->get_metadata()["tokenizer_type"]));
+#else // __ET_MODEL__
   ModelType model_type = get_model_type(llama_ver);
+#endif
+
   if (model_type == UNKNOWN_MODEL) {
     fprintf(
         stderr,
@@ -923,11 +915,6 @@ int main(int argc, char* argv[]) {
     error_usage();
   }
 
-  if (model_path == NULL) {
-    fprintf(stderr, "No model_path provided.");
-    error_usage();
-  }
-
   if (tokenizer_path == NULL) {
     fprintf(stderr, "No tokenizer_path provided.");
     error_usage();
@@ -950,8 +937,12 @@ int main(int argc, char* argv[]) {
     vocab_size = tokenizer->vocab_size();
   }
 
-  Transformer transformer;
-  build_transformer(&transformer, model_path, vocab_size, steps);
+  // read in the Config and the Weights from the model
+  // read_checkpoint(model_path, &t->config);
+  // allocate the RunState buffers
+  transformer.config.vocab_size = vocab_size;
+  transformer.config.seq_len = steps;
+  malloc_run_state(&transformer.state, &transformer.config);
 
   Sampler sampler;
   build_sampler(&sampler, vocab_size, temperature, topp, rng_seed);
diff --git a/torchchat/export.py b/torchchat/export.py
@@ -5,13 +5,13 @@
 # LICENSE file in the root directory of this source tree.
 
 import os
-from typing import Optional
+from typing import Dict, Optional
 
 import torch
+import torch._inductor
 import torch.nn as nn
 
 from torch.export import Dim
-import torch._inductor
 
 from torchchat.cli.builder import (
     _initialize_model,
@@ -39,6 +39,7 @@ def export_for_server(
     output_path: str = "model.pt2",
     dynamic_shapes: bool = False,
     package: bool = True,
+    metadata: Dict[str, str] = {},
 ) -> str:
     """
     Export the model using AOT Compile to get a .dso for server use cases.
@@ -67,7 +68,6 @@ def export_for_server(
         dynamic_shapes = None
 
     with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]):
-        metadata = {}  # TODO: put more metadata here
         options = {"aot_inductor.package": package, "aot_inductor.metadata": metadata}
         if not package:
             options = {"aot_inductor.output_path": output_path}
@@ -81,6 +81,7 @@ def export_for_server(
 
         if package:
             from torch._inductor.package import package_aoti
+
             path = package_aoti(output_path, path)
 
     print(f"The generated packaged model can be found at: {path}")
@@ -102,13 +103,13 @@ def export_for_server(
     from typing import Any, Dict, Tuple, Union
 
     import executorch.exir as exir
+    from executorch.backends.xnnpack._passes.convert_to_linear import (
+        ConvertToLinearPass,
+    )
 
     from executorch.backends.xnnpack.partition.xnnpack_partitioner import (
         XnnpackDynamicallyQuantizedPartitioner,
     )
-    from executorch.backends.xnnpack._passes.convert_to_linear import (
-        ConvertToLinearPass,
-    )
     from executorch.exir import EdgeProgramManager, to_edge
 
     from executorch.exir.capture._config import (
@@ -166,18 +167,22 @@ def __init__(self, attention: Attention):
 
             self.wo = attention.wo
 
-            max_batch_size, n_heads, max_seq_length, head_dim = (
-                attention.kv_cache[0].k_cache.shape
-            )
+            max_batch_size, n_heads, max_seq_length, head_dim = attention.kv_cache[
+                0
+            ].k_cache.shape
             cache_dtype = attention.kv_cache[0].k_cache.dtype
             # The `Attention` module being replaced can have multiple KV caches
             # (denoted by `cache_lanes`).  Thus we follow the same setup format
             # as in `Attention.setup_cache`.
             cache_lanes = len(attention.kv_cache)
-            self.kv_cache = nn.ModuleList([
-                CustomKVCache(max_batch_size, max_seq_length, n_heads, head_dim, cache_dtype)
-                for _ in range(cache_lanes)
-            ])
+            self.kv_cache = nn.ModuleList(
+                [
+                    CustomKVCache(
+                        max_batch_size, max_seq_length, n_heads, head_dim, cache_dtype
+                    )
+                    for _ in range(cache_lanes)
+                ]
+            )
 
             self.n_heads = attention.n_heads
             self.head_dim = attention.head_dim
@@ -215,9 +220,7 @@ def forward(self, x, freqs_cis, mask, input_pos=None, cache_lane: int = 0):
             return self.wo(output)
 
     def replace_attention_with_custom_sdpa_attention(module: nn.Module):
-        from executorch.extension.llm.custom_ops import (  # noqa
-            sdpa_with_kv_cache,
-        )
+        from executorch.extension.llm.custom_ops import sdpa_with_kv_cache  # noqa
 
         for name, child in module.named_children():
             if isinstance(child, Attention):
@@ -350,7 +353,11 @@ def main(args):
 
     print(f"Using device={builder_args.device}")
     set_precision(builder_args.precision)
-    set_backend(dso=args.output_dso_path, pte=args.output_pte_path, aoti_package=args.output_aoti_package_path)
+    set_backend(
+        dso=args.output_dso_path,
+        pte=args.output_pte_path,
+        aoti_package=args.output_aoti_package_path,
+    )
 
     builder_args.dso_path = None
     builder_args.pte_path = None
@@ -372,6 +379,7 @@ def main(args):
 
     # TODO: clean this up
     # This mess is because ET does not support _weight_int4pack_mm right now
+    tokenizer_args = None
     if not builder_args.gguf_path:
         # tokenizer needed for quantization so get that here,
         try:
@@ -382,9 +390,8 @@ def main(args):
 
         if builder_args.max_seq_length is None:
             if (
-                (output_dso_path is not None or output_aoti_package_path is not None)
-                and not builder_args.dynamic_shapes
-            ):
+                output_dso_path is not None or output_aoti_package_path is not None
+            ) and not builder_args.dynamic_shapes:
                 print("Setting max_seq_length to 300 for DSO export.")
                 builder_args.max_seq_length = 300
             elif output_pte_path is not None:
@@ -397,7 +404,8 @@ def main(args):
             quantize,
             tokenizer,
             max_seq_length=builder_args.max_seq_length,
-            support_tensor_subclass=output_dso_path is None and output_aoti_package_path is None,
+            support_tensor_subclass=output_dso_path is None
+            and output_aoti_package_path is None,
         )
         model_to_pte = model
         model_to_dso = model
@@ -435,7 +443,9 @@ def main(args):
         if output_dso_path:
             output_dso_path = str(os.path.abspath(output_dso_path))
             print(f"Exporting model using AOT Inductor to {output_dso_path}")
-            print("WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead.")
+            print(
+                "WARNING!! The path of compiling a dso is deprecated. Please use --output-aoti-package-path to create a .pt2 artifact instead."
+            )
             export_for_server(
                 model_to_dso,
                 builder_args.device,
@@ -446,11 +456,21 @@ def main(args):
 
         if output_aoti_package_path:
             output_aoti_package_path = str(os.path.abspath(output_aoti_package_path))
+
+            if tokenizer_args is None:
+                tokenizer_type = "0"
+            elif tokenizer_args.is_sentencepiece:
+                tokenizer_type = "2"  # Corresponding to llama2
+            else:
+                tokenizer_type = "3"  # Corresponding to llama3
+
+            metadata = {"tokenizer_type": tokenizer_type}
             print(f"Exporting model using AOT Inductor to {output_aoti_package_path}")
             export_for_server(
                 model_to_aoti_package,
                 builder_args.device,
                 output_aoti_package_path,
                 builder_args.dynamic_shapes,
                 package=True,
+                metadata=metadata,
             )