
Commit 9a53478 (parent: 9c47edc)

[aoti] Add cpp packaging for aoti + loading in python

File tree: 8 files changed, +140 −38 lines


.gitignore

Lines changed: 4 additions & 0 deletions

```diff
@@ -6,6 +6,7 @@ __pycache__/
 # C extensions
 *.so
 
+.vscode
 .model-artifacts/
 .venv
 .torchchat
@@ -24,3 +25,6 @@ system_info.txt
 
 # intermediate system file
 .DS_Store
+checkpoints/
+exportedModels/
+cmake-out/
```

README.md

Lines changed: 11 additions & 5 deletions

````diff
@@ -260,8 +260,9 @@ that is then loaded for inference. This can be done with both Python and C++ env
 
 The following example exports and executes the Llama3.1 8B Instruct
 model. The first command compiles and performs the actual export.
-```
-python3 torchchat.py export llama3.1 --output-dso-path exportedModels/llama3.1.so
+
+```bash
+python3 torchchat.py export llama3.1 --output-aoti-package-path exportedModels/llama3_1_artifacts
 ```
 
 > [!NOTE]
@@ -275,7 +276,7 @@ case visit our [customization guide](docs/model_customization.md).
 
 To run in a Python environment, use the generate subcommand like before, but include the dso file.
 
-```
+```bash
 python3 torchchat.py generate llama3.1 --dso-path exportedModels/llama3.1.so --prompt "Hello my name is"
 ```
 **Note:** Depending on which accelerator is used to generate the .dso file, the command may need the device specified: `--device (cuda | cpu)`.
@@ -288,9 +289,14 @@ To run in a C++ environment, we need to build the runner binary.
 torchchat/utils/scripts/build_native.sh aoti
 ```
 
-Then run the compiled executable, with the exported DSO from earlier.
+To compile the AOTI generated artifacts into a `.so`:
+```bash
+make -C exportedModels/llama3_1_artifacts
+```
+
+Then run the compiled executable, with the compiled DSO.
 ```bash
-cmake-out/aoti_run exportedModels/llama3.1.so -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time"
+cmake-out/aoti_run exportedModels/llama3_1_artifacts/llama3_1_artifacts.so -z `python3 torchchat.py where llama3.1`/tokenizer.model -l 3 -i "Once upon a time"
 ```
 **Note:** Depending on which accelerator is used to generate the .dso file, the runner may need the device specified: `-d (CUDA | CPU)`.
 
````
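To make the new Python-side flow concrete, here is a minimal sketch of loading the exported package directly, relying on the same private `torch._inductor.package.load_package` API this commit wires into `torchchat/cli/builder.py` below. The path, input shapes, and dtypes are illustrative assumptions, not part of the commit.

```python
# Hedged sketch: load the AOTI package exported above and call its forward.
# Assumes a PyTorch build that provides torch._inductor.package.load_package
# (the same private API used in torchchat/cli/builder.py below).
import torch
from torch._inductor.package import load_package

compiled_forward = load_package(
    "exportedModels/llama3_1_artifacts",  # value of --output-aoti-package-path
    "cpu",                                # device the artifacts were compiled for
)

# Illustrative inputs only: a batch of token ids and their positions, which
# must match the example inputs the model was exported with.
tokens = torch.tensor([[1]], dtype=torch.int)
input_pos = torch.tensor([0], dtype=torch.int)
logits = compiled_forward(tokens, input_pos)
```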

torchchat/cli/builder.py

Lines changed: 50 additions & 12 deletions

```diff
@@ -53,6 +53,7 @@ class BuilderArgs:
     gguf_path: Optional[Union[Path, str]] = None
     gguf_kwargs: Optional[Dict[str, Any]] = None
     dso_path: Optional[Union[Path, str]] = None
+    aoti_package_path: Optional[Union[Path, str]] = None
     pte_path: Optional[Union[Path, str]] = None
     device: Optional[str] = None
     precision: torch.dtype = torch.float32
@@ -72,28 +73,29 @@ def __post_init__(self):
             or (self.checkpoint_dir and self.checkpoint_dir.is_dir())
             or (self.gguf_path and self.gguf_path.is_file())
             or (self.dso_path and Path(self.dso_path).is_file())
+            or (self.aoti_package_path and Path(self.aoti_package_path).is_file())
             or (self.pte_path and Path(self.pte_path).is_file())
         ):
             raise RuntimeError(
                 "need to specify a valid checkpoint path, checkpoint dir, gguf path, DSO path, or PTE path"
             )
 
-        if self.dso_path and self.pte_path:
-            raise RuntimeError("specify either DSO path or PTE path, but not both")
+        if self.pte_path and self.aoti_package_path:
+            raise RuntimeError("specify either AOTI Package path or PTE path, but not more than one")
 
-        if self.checkpoint_path and (self.dso_path or self.pte_path):
+        if self.checkpoint_path and (self.pte_path or self.aoti_package_path):
             print(
-                "Warning: checkpoint path ignored because an exported DSO or PTE path specified"
+                "Warning: checkpoint path ignored because an exported AOTI or PTE path specified"
             )
-        if self.checkpoint_dir and (self.dso_path or self.pte_path):
+        if self.checkpoint_dir and (self.pte_path or self.aoti_package_path):
             print(
-                "Warning: checkpoint dir ignored because an exported DSO or PTE path specified"
+                "Warning: checkpoint dir ignored because an exported AOTI or PTE path specified"
             )
-        if self.gguf_path and (self.dso_path or self.pte_path):
+        if self.gguf_path and (self.pte_path or self.aoti_package_path):
             print(
-                "Warning: GGUF path ignored because an exported DSO or PTE path specified"
+                "Warning: GGUF path ignored because an exported AOTI or PTE path specified"
             )
-        if not (self.dso_path) and not (self.pte_path):
+        if not (self.dso_path) and not (self.aoti_package_path):
             self.prefill_possible = True
 
     @classmethod
@@ -123,6 +125,7 @@ def from_args(cls, args): # -> BuilderArgs:
 
         dso_path = getattr(args, "dso_path", None)
         pte_path = getattr(args, "pte_path", None)
+        aoti_package_path = getattr(args, "aoti_package_path", None)
 
         is_chat_model = False
         if args.is_chat_model:
@@ -133,6 +136,7 @@ def from_args(cls, args): # -> BuilderArgs:
             checkpoint_dir,
             dso_path,
             pte_path,
+            aoti_package_path,
             args.gguf_path,
         ]:
             if path is not None:
@@ -148,6 +152,7 @@ def from_args(cls, args): # -> BuilderArgs:
 
 
         output_pte_path = getattr(args, "output_pte_path", None)
+        output_aoti_package_path = getattr(args, "output_aoti_package_path", None)
         output_dso_path = getattr(args, "output_dso_path", None)
         if output_pte_path and args.dtype.startswith("fast"):
             if args.dtype == "fast":
@@ -169,10 +174,11 @@ def from_args(cls, args): # -> BuilderArgs:
             gguf_path=args.gguf_path,
             gguf_kwargs=None,
             dso_path=dso_path,
+            aoti_package_path=aoti_package_path,
             pte_path=pte_path,
             device=args.device,
             precision=dtype,
-            setup_caches=(output_dso_path or output_pte_path),
+            setup_caches=(output_dso_path or output_pte_path or output_aoti_package_path),
             use_distributed=args.distributed,
             is_chat_model=is_chat_model,
             dynamic_shapes=getattr(args, "dynamic_shapes", False),
@@ -187,6 +193,7 @@ def from_speculative_args(cls, args): # -> BuilderArgs:
         speculative_builder_args.checkpoint_path = args.draft_checkpoint_path
         speculative_builder_args.gguf_path = None
         speculative_builder_args.dso_path = None
+        speculative_builder_args.aoti_package_path = None
         speculative_builder_args.pte_path = None
         return speculative_builder_args
 
@@ -466,11 +473,12 @@ def _initialize_model(
 ):
     print("Loading model...")
 
-    if builder_args.gguf_path and (builder_args.dso_path or builder_args.pte_path):
+    if builder_args.gguf_path and (builder_args.dso_path or builder_args.aoti_package_path or builder_args.pte_path):
         print("Setting gguf_kwargs for generate.")
         is_dso = builder_args.dso_path is not None
+        is_aoti_package = builder_args.aoti_package_path is not None
         is_pte = builder_args.pte_path is not None
-        assert not (is_dso and is_pte)
+        assert not (is_dso and is_aoti_package and is_pte)
         assert builder_args.gguf_kwargs is None
         # TODO: make GGUF load independent of backend
         # currently not working because AVX int_mm broken
@@ -504,6 +512,36 @@ def _initialize_model(
             )
         except:
             raise RuntimeError(f"Failed to load AOTI compiled {builder_args.dso_path}")
+
+    elif builder_args.aoti_package_path:
+        if not is_cuda_or_cpu_device(builder_args.device):
+            print(
+                f"Cannot load specified PT2 to {builder_args.device}. Attempting to load model to CPU instead"
+            )
+            builder_args.device = "cpu"
+
+        # assert (
+        #     quantize is None or quantize == "{ }"
+        # ), "quantize not valid for exported PT2 model. Specify quantization during export."
+
+        with measure_time("Time to load model: {time:.02f} seconds"):
+            model = _load_model(builder_args, only_config=True)
+            device_sync(device=builder_args.device)
+
+        try:
+            # Replace model forward with the AOT-compiled forward.
+            # This is a hacky way to quickly demo AOTI's capability.
+            # model is still a Python object, and any mutation to its
+            # attributes will NOT be seen by the AOTI-compiled forward
+            # function, e.g. calling model.setup_cache will NOT touch
+            # AOTI compiled and maintained model buffers such as kv_cache.
+            from torch._inductor.package import load_package
+
+            model.forward = load_package(
+                str(builder_args.aoti_package_path.absolute()), builder_args.device
+            )
+        except:
+            raise RuntimeError(f"Failed to load AOTI compiled {builder_args.aoti_package_path}")
+
     elif builder_args.pte_path:
         if not is_cpu_device(builder_args.device):
             print(
```
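The precedence rules in `__post_init__` above are easy to check in isolation. The toy below (hypothetical names, not the real `BuilderArgs`) reproduces the two behaviors the diff adds: mutually exclusive exported-artifact paths, and source inputs being ignored when an exported artifact is given.

```python
# Self-contained toy reproducing the validation logic added above.
from dataclasses import dataclass
from typing import Optional

@dataclass
class ToyBuilderArgs:
    checkpoint_path: Optional[str] = None
    pte_path: Optional[str] = None
    aoti_package_path: Optional[str] = None

    def __post_init__(self):
        # At most one exported-artifact path may drive the build.
        if self.pte_path and self.aoti_package_path:
            raise RuntimeError("specify either AOTI Package path or PTE path, but not more than one")
        # Source inputs lose to an exported artifact.
        if self.checkpoint_path and (self.pte_path or self.aoti_package_path):
            print("Warning: checkpoint path ignored because an exported AOTI or PTE path specified")

ToyBuilderArgs(aoti_package_path="exportedModels/llama3_1_artifacts")  # accepted
ToyBuilderArgs(checkpoint_path="model.pth", pte_path="model.pte")      # warns
# ToyBuilderArgs(pte_path="model.pte", aoti_package_path="pkg")        # raises
```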

torchchat/cli/cli.py

Lines changed: 12 additions & 0 deletions

```diff
@@ -191,6 +191,12 @@ def _add_export_output_path_args(parser) -> None:
         default=None,
         help="Output to the specified AOT Inductor .dso model file",
     )
+    output_path_parser.add_argument(
+        "--output-aoti-package-path",
+        type=str,
+        default=None,
+        help="Output directory for AOTInductor compiled artifacts",
+    )
 
 
 def _add_export_args(parser) -> None:
@@ -220,6 +226,12 @@ def _add_exported_input_path_args(parser) -> None:
         default=None,
         help="Use the specified AOT Inductor .dso model file",
     )
+    exclusive_parser.add_argument(
+        "--aoti-package-path",
+        type=Path,
+        default=None,
+        help="Use the specified directory containing AOT Inductor compiled files",
+    )
     exclusive_parser.add_argument(
         "--pte-path",
         type=Path,
```
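`exclusive_parser` is, as its name suggests, presumably an argparse mutually exclusive group, so `--dso-path`, `--aoti-package-path`, and `--pte-path` cannot be combined on one command line. A minimal sketch of that pattern (the flag names come from the commit; everything else is illustrative):

```python
# Minimal argparse sketch of the mutually exclusive input-path flags.
import argparse
from pathlib import Path

parser = argparse.ArgumentParser()
exclusive = parser.add_mutually_exclusive_group()
exclusive.add_argument("--dso-path", type=Path, default=None)
exclusive.add_argument("--aoti-package-path", type=Path, default=None)
exclusive.add_argument("--pte-path", type=Path, default=None)

args = parser.parse_args(["--aoti-package-path", "exportedModels/llama3_1_artifacts"])
print(args.aoti_package_path)

# Passing two of the flags at once makes argparse exit with an error such as:
# "argument --pte-path: not allowed with argument --aoti-package-path"
```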

torchchat/export.py

Lines changed: 31 additions & 8 deletions

```diff
@@ -35,8 +35,10 @@
 def export_for_server(
     model: nn.Module,
     device: Optional[str] = "cpu",
-    output_path: str = "model.dso",
+    output_path: str = "model.pt2",
     dynamic_shapes: bool = False,
+    package: bool = True,
+    model_key: str = "",
 ) -> str:
     """
     Export the model using AOT Compile to get a .dso for server use cases.
@@ -65,14 +67,17 @@ def export_for_server(
         dynamic_shapes = None
 
     with torch.nn.attention.sdpa_kernel([torch.nn.attention.SDPBackend.MATH]):
-        so = torch._export.aot_compile(
+        path = torch._export.aot_compile(
             model,
             args=input,
-            options={"aot_inductor.output_path": output_path},
+            options={
+                "aot_inductor.output_path": output_path,
+                "aot_inductor.package": package,
+            },
             dynamic_shapes=dynamic_shapes,
         )
-    print(f"The generated DSO model can be found at: {so}")
-    return so
+    print(f"The generated DSO model can be found at: {path}")
+    return path
 
 
 """
@@ -335,14 +340,16 @@ def main(args):
 
     print(f"Using device={builder_args.device}")
     set_precision(builder_args.precision)
-    set_backend(dso=args.output_dso_path, pte=args.output_pte_path)
+    set_backend(dso=args.output_dso_path, pte=args.output_pte_path, aoti_package=args.output_aoti_package_path)
 
     builder_args.dso_path = None
     builder_args.pte_path = None
+    builder_args.aoti_package_path = None
     builder_args.setup_caches = True
 
     output_pte_path = args.output_pte_path
     output_dso_path = args.output_dso_path
+    output_aoti_package_path = args.output_aoti_package_path
 
     if output_pte_path and builder_args.device != "cpu":
         print(
@@ -380,6 +387,7 @@ def main(args):
         )
         model_to_pte = model
         model_to_dso = model
+        model_to_aoti_package = model
     else:
         if output_pte_path:
             _set_gguf_kwargs(builder_args, is_et=True, context="export")
@@ -389,13 +397,14 @@ def main(args):
             )
             _unset_gguf_kwargs(builder_args)
 
-        if output_dso_path:
+        if output_dso_path or output_aoti_package_path:
             _set_gguf_kwargs(builder_args, is_et=False, context="export")
-            model_to_dso = _initialize_model(
+            model_to_aoti_package = _initialize_model(
                 builder_args,
                 quantize,
                 support_tensor_subclass=False,
             )
+            model_to_dso = model_to_aoti_package
             _unset_gguf_kwargs(builder_args)
 
     with torch.no_grad():
@@ -409,6 +418,7 @@ def main(args):
                 "Export with executorch requested but ExecuTorch could not be loaded"
             )
             print(executorch_exception)
+
         if output_dso_path:
             output_dso_path = str(os.path.abspath(output_dso_path))
             print(f"Exporting model using AOT Inductor to {output_dso_path}")
@@ -417,4 +427,17 @@ def main(args):
                 builder_args.device,
                 output_dso_path,
                 builder_args.dynamic_shapes,
+                package=False,
+            )
+
+        if output_aoti_package_path:
+            output_aoti_package_path = str(os.path.abspath(output_aoti_package_path))
+            print(f"Exporting model using AOT Inductor to {output_aoti_package_path}")
+            export_for_server(
+                model_to_aoti_package,
+                builder_args.device,
+                output_aoti_package_path,
+                builder_args.dynamic_shapes,
+                package=True,
+                model_key=builder_args.params_table,
             )
```
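For readers unfamiliar with the packaged export path, here is a minimal sketch of the `aot_compile` call pattern above, applied to a toy module. It assumes the same private `torch._export.aot_compile` API and `aot_inductor.*` option keys the commit uses; the module, shapes, and output directory are illustrative.

```python
# Hedged sketch of a packaged AOTI export (toy module, illustrative paths).
import torch
import torch._export
import torch.nn as nn

class Toy(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(x) + 1.0

example_inputs = (torch.randn(4, 8),)
artifact_path = torch._export.aot_compile(
    Toy(),
    args=example_inputs,
    options={
        # With package=True the output is a directory of compilable
        # artifacts (plus a Makefile, per the README above) rather than
        # a single prebuilt .so.
        "aot_inductor.output_path": "exportedModels/toy_artifacts",
        "aot_inductor.package": True,
    },
)
print(f"AOTI artifacts written to: {artifact_path}")
```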

torchchat/generate.py

Lines changed: 16 additions & 3 deletions

```diff
@@ -133,8 +133,8 @@ def validate_build(
             reason = "model compilation for prefill"
         if self.compile:
             reason = "model compilation"
-        if builder_args.dso_path:
-            model_type = "DSO"
+        if builder_args.aoti_package_path:
+            model_type = "PT2"
         if builder_args.pte_path:
             model_type = "PTE"
         if model_type and reason:
@@ -146,7 +146,10 @@
     def from_args(cls, args):
         dso_path = getattr(args, "dso_path", None)
         pte_path = getattr(args, "pte_path", None)
-        sequential_prefill = args.sequential_prefill or bool(dso_path) or bool(pte_path)
+        aoti_package_path = getattr(args, "aoti_package_path", None)
+        sequential_prefill = (
+            args.sequential_prefill or bool(aoti_package_path) or bool(pte_path)
+        )
 
         return cls(
             prompt=getattr(args, "prompt", ""),
@@ -948,3 +951,13 @@ def main(args):
         torch.cuda.reset_peak_memory_stats()
     for _ in gen.chat(generator_args):
         pass
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="torchchat generate CLI")
+    verb = "generate"
+    add_arguments_for_verb(parser, verb)
+    args = parser.parse_args()
+    check_args(args, verb)
+    args = arg_init(args)
+    main(args)
```
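The new `__main__` block makes `torchchat/generate.py` invocable as a standalone script. The equivalent programmatic call looks roughly like the sketch below; the import paths and CLI arguments are assumptions based on the helpers named in the diff, not something this commit spells out.

```python
# Hedged sketch of driving generate.main() directly (assumed import paths).
import argparse

from torchchat.cli.cli import add_arguments_for_verb, arg_init, check_args
from torchchat.generate import main

parser = argparse.ArgumentParser(description="torchchat generate CLI")
add_arguments_for_verb(parser, "generate")
args = parser.parse_args(
    ["llama3.1",  # model positional, as in the README examples above
     "--aoti-package-path", "exportedModels/llama3_1_artifacts",
     "--prompt", "Hello my name is"]
)
check_args(args, "generate")
args = arg_init(args)
main(args)
```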

torchchat/usages/eval.py

Lines changed: 3 additions & 1 deletion

```diff
@@ -260,7 +260,7 @@ def main(args) -> None:
 
     if compile:
         assert not (
-            builder_args.dso_path or builder_args.pte_path
+            builder_args.dso_path or builder_args.pte_path or builder_args.aoti_package_path
        ), "cannot compile exported model"
         model_forward = torch.compile(
             model_forward, mode="reduce-overhead", dynamic=True, fullgraph=True
@@ -288,6 +288,8 @@ def main(args) -> None:
         )
     if builder_args.dso_path:
         print(f"For model {builder_args.dso_path}")
+    if builder_args.aoti_package_path:
+        print(f"For model {builder_args.aoti_package_path}")
     elif builder_args.pte_path:
         print(f"For model {builder_args.pte_path}")
     elif builder_args.checkpoint_path:
```
