
Commit 3b52e0e

Merge branch 'main' into main

2 parents f442e9e + 4a7dab8

File tree: 3 files changed (+40, −12 lines)
torchchat/cli/builder.py

Lines changed: 20 additions & 0 deletions
@@ -536,6 +536,15 @@ def _load_model(builder_args: BuilderArgs) -> Model:
         model = _load_model_default(builder_args)
     # model = _maybe_parallelize_model(model, builder_args, world_mesh, parallel_dims)

+    if builder_args.dso_path or builder_args.aoti_package_path:
+        # The AOTI-compiled model will load its own weights.
+        # Release the eager weights here to avoid OOM.
+        import gc
+        if hasattr(model, "model"):
+            model.model = None
+        gc.collect()
+        torch.cuda.empty_cache()
+
     model = model.to(device=builder_args.device, dtype=builder_args.precision)
     return model.eval()
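
The pattern in this hunk stands on its own: drop the eager-mode weights before the AOTI artifact loads its own copy, so the two never coexist in GPU memory. A minimal self-contained sketch of the same idea (the helper name and wrapper attribute are illustrative, not part of the commit):

import gc

import torch


def release_eager_weights(wrapper) -> None:
    # Drop the Python-side reference to the eager weights; the AOTI-compiled
    # artifact will load its own copy, so keeping both risks OOM.
    if hasattr(wrapper, "model"):
        wrapper.model = None
    # Force a collection so the tensors are actually freed, then return the
    # now-unused cached blocks to the CUDA allocator.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()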

@@ -584,6 +593,12 @@ def _initialize_model(
             # attributes will NOT be seen by the AOTI-compiled forward
             # function, e.g. calling model.setup_cache will NOT touch
             # AOTI compiled and maintained model buffers such as kv_cache.
+            # Using the cpp runner to run the AOTI-compiled model is recommended.
+
+            def do_nothing(max_batch_size, max_seq_length):
+                pass
+            model.setup_caches = do_nothing
+
             model.forward = torch._export.aot_load(
                 str(builder_args.dso_path.absolute()), builder_args.device
             )
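
Both the DSO branch above and the .pt2 package branch below apply the same stub, so the idea reads more clearly as one helper. A minimal sketch (the function name is hypothetical; torch._export.aot_load is the call used in the hunk):

import torch
import torch._export


def attach_aoti_forward(model, dso_path: str, device: str):
    # The AOTI artifact owns buffers such as kv_cache, so cache setup on the
    # Python-side wrapper must become a no-op rather than silently diverge.
    def do_nothing(max_batch_size, max_seq_length):
        pass

    model.setup_caches = do_nothing
    # Swap the eager forward for the compiled callable loaded from the .so.
    model.forward = torch._export.aot_load(dso_path, device)
    return model
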
@@ -617,6 +632,11 @@ def _initialize_model(
             aoti_compiled_model = load_package(
                 str(builder_args.aoti_package_path.absolute())
             )
+
+            def do_nothing(max_batch_size, max_seq_length):
+                pass
+            model.setup_caches = do_nothing
+
             model.forward = aoti_compiled_model
             metadata = aoti_compiled_model.get_metadata()
             builder_args.device = metadata["AOTI_DEVICE_KEY"]

torchchat/export.py

Lines changed: 3 additions & 3 deletions
@@ -78,11 +78,11 @@ def export_for_server(
             dynamic_shapes=dynamic_shapes,
             options=options,
         )
-
+
         if package:
             from torch._inductor.package import package_aoti
             path = package_aoti(output_path, path)
-
+
     print(f"The generated packaged model can be found at: {path}")
     return path

(The two paired blank-line changes in this hunk are whitespace-only.)
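
The path handed to package_aoti here is the set of files produced by torch._export.aot_compile when packaging is enabled, and the resulting .pt2 archive is what builder.py's load_package consumes above. A minimal round-trip sketch under that assumption (the toy module and file name are placeholders, not torchchat code):

import torch
import torch._export
from torch._inductor.package import load_package, package_aoti


class Tiny(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.relu(x)


# Compile with packaging enabled, as export_for_server does when package=True.
aoti_files = torch._export.aot_compile(
    Tiny(),
    args=(torch.randn(4, 8),),
    options={"aot_inductor.package": True},
)
pt2_path = package_aoti("tiny.pt2", aoti_files)  # write the .pt2 archive
compiled = load_package(pt2_path)                # callable compiled model
print(compiled(torch.randn(4, 8)).shape)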

@@ -382,7 +382,7 @@ def main(args):

     if builder_args.max_seq_length is None:
         if (
-            output_dso_path is not None
+            (output_dso_path is not None or output_aoti_package_path is not None)
             and not builder_args.dynamic_shapes
         ):
             print("Setting max_seq_length to 300 for DSO export.")

torchchat/utils/build_utils.py

Lines changed: 17 additions & 9 deletions
@@ -11,7 +11,7 @@

 from enum import Enum
 from pathlib import Path
-from typing import Any, Callable, Dict, List, Tuple
+from typing import Any, Callable, Dict, List, Optional, Tuple

 import torch

@@ -77,31 +77,39 @@ def unpack_packed_weights(
 def set_backend(dso, pte, aoti_package):
     global active_builder_args_dso
     global active_builder_args_pte
+    global active_builder_args_aoti_package
     active_builder_args_dso = dso
     active_builder_args_aoti_package = aoti_package
     active_builder_args_pte = pte


 class _Backend(Enum):
-    AOTI = (0,)
+    AOTI = 0
     EXECUTORCH = 1


-def _active_backend() -> _Backend:
+def _active_backend() -> Optional[_Backend]:
     global active_builder_args_dso
     global active_builder_args_aoti_package
     global active_builder_args_pte

-    # eager == aoti, which is when backend has not been explicitly set
-    if (not active_builder_args_pte) and (not active_builder_args_aoti_package):
-        return True
+    args = (
+        active_builder_args_dso,
+        active_builder_args_pte,
+        active_builder_args_aoti_package,
+    )
+
+    # Return None as the default when no backend has been set.
+    if not any(args):
+        return None

-    if active_builder_args_pte and active_builder_args_aoti_package:
+    # Catch more than one export target being set at once.
+    if sum(map(bool, args)) > 1:
         raise RuntimeError(
-            "code generation needs to choose different implementations for AOTI and PTE path. Please only use one export option, and call export twice if necessary!"
+            "Code generation needs to choose different implementations. Please only use one export option, and call export twice if necessary!"
         )

-    return _Backend.AOTI if active_builder_args_pte else _Backend.EXECUTORCH
+    return _Backend.EXECUTORCH if active_builder_args_pte else _Backend.AOTI


 def use_aoti_backend() -> bool:
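
The consumers of _active_backend sit below this hunk and are not shown; the new contract itself can be read off the diff. A short usage sketch with placeholder artifact paths (eager, i.e. nothing set, now maps to None rather than the old bare return True):

from torchchat.utils.build_utils import _Backend, _active_backend, set_backend

# Nothing exported yet: the backend is simply undetermined.
set_backend(dso=None, pte=None, aoti_package=None)
assert _active_backend() is None

# Exactly one target set: a PTE file selects ExecuTorch...
set_backend(dso=None, pte="model.pte", aoti_package=None)
assert _active_backend() is _Backend.EXECUTORCH

# ...while a DSO (or an AOTI package) selects AOTI.
set_backend(dso="model.so", pte=None, aoti_package=None)
assert _active_backend() is _Backend.AOTI

# Two targets at once now trigger the reworded RuntimeError.
set_backend(dso="model.so", pte="model.pte", aoti_package=None)
try:
    _active_backend()
except RuntimeError as err:
    print(err)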
