
Commit de0f6f1

Make export llama checkpoint and param optional
Differential Revision: D71404805
Pull Request resolved: #9456
1 parent 38851a1 commit de0f6f1

4 files changed (+51 / -44 lines)


examples/models/llama/export_llama_lib.py

Lines changed: 12 additions & 26 deletions
@@ -124,26 +124,19 @@ def verbose_export():
 
 
 def build_model(
-    modelname: str = "llama3",
-    extra_opts: str = "",
-    *,
-    par_local_output: bool = False,
-    resource_pkg_name: str = __name__,
+    model: str,
+    checkpoint: str,
+    params: str,
+    output_dir: Optional[str] = ".",
+    extra_opts: Optional[str] = "",
 ) -> str:
-    if False:  # par_local_output:
-        output_dir_path = "par:."
-    else:
-        output_dir_path = "."
-
-    argString = f"--model {modelname} --checkpoint par:model_ckpt.pt --params par:model_params.json {extra_opts} --output-dir {output_dir_path}"
+    argString = f"--model {model} --checkpoint {checkpoint} --params {params} {extra_opts} --output-dir {output_dir}"
     parser = build_args_parser()
     args = parser.parse_args(shlex.split(argString))
-    # pkg_name = resource_pkg_name
     return export_llama(args)
 
 
 def build_args_parser() -> argparse.ArgumentParser:
-    ckpt_dir = f"{Path(__file__).absolute().parent.as_posix()}"
     parser = argparse.ArgumentParser()
     parser.add_argument("-o", "--output-dir", default=".", help="output directory")
     # parser.add_argument(
@@ -192,8 +185,8 @@ def build_args_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "-c",
         "--checkpoint",
-        default=f"{ckpt_dir}/params/demo_rand_params.pth",
-        help="checkpoint path",
+        required=False,
+        help="Path to the checkpoint .pth file. When not provided, the model will be initialized with random weights.",
     )
 
     parser.add_argument(
@@ -274,8 +267,8 @@ def build_args_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "-p",
         "--params",
-        default=f"{ckpt_dir}/params/demo_config.json",
-        help="config.json",
+        required=False,
+        help="Config file for model parameters. When not provided, the model will fallback on default values defined in examples/models/llama/model_args.py.",
     )
     parser.add_argument(
         "--optimized_rotation_path",
@@ -562,7 +555,7 @@ def _prepare_for_llama_export(args) -> LLMEdgeManager:
     checkpoint_dir = (
         canonical_path(args.checkpoint_dir) if args.checkpoint_dir else None
     )
-    params_path = canonical_path(args.params)
+    params_path = canonical_path(args.params) if args.params else None
     output_dir_path = canonical_path(args.output_dir, dir=True)
     weight_type = WeightType.FAIRSEQ2 if args.fairseq2 else WeightType.LLAMA
 
@@ -985,7 +978,7 @@ def _load_llama_model(
     *,
     checkpoint: Optional[str] = None,
     checkpoint_dir: Optional[str] = None,
-    params_path: str,
+    params_path: Optional[str] = None,
    use_kv_cache: bool = False,
     use_sdpa_with_kv_cache: bool = False,
     generate_full_logits: bool = False,
@@ -1012,13 +1005,6 @@ def _load_llama_model(
         An instance of LLMEdgeManager which contains the eager mode model.
     """
 
-    assert (
-        checkpoint or checkpoint_dir
-    ) and params_path, "Both checkpoint/checkpoint_dir and params can't be empty"
-    logging.info(
-        f"Loading model with checkpoint={checkpoint}, params={params_path}, use_kv_cache={use_kv_cache}, weight_type={weight_type}"
-    )
-
     if modelname in EXECUTORCH_DEFINED_MODELS:
         module_name = "llama"
         model_class_name = "Llama2Model"  # TODO: Change to "LlamaModel" in examples/models/llama/model.py.
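
With this change, `build_model` takes the model name, checkpoint, and params paths explicitly, and the `--checkpoint`/`--params` CLI flags lose their demo-file defaults and may simply be omitted. A minimal usage sketch of the new signature, assuming the module is importable as `executorch.examples.models.llama.export_llama_lib`; the paths and the `extra_opts` value are placeholders, not part of this commit:

```python
from executorch.examples.models.llama.export_llama_lib import build_model

# Hypothetical call against the new keyword layout; paths are placeholders.
pte_path = build_model(
    model="llama3",
    checkpoint="/tmp/model_ckpt.pt",     # no longer hard-coded to par:model_ckpt.pt
    params="/tmp/model_params.json",     # no longer hard-coded to par:model_params.json
    output_dir=".",                      # optional, defaults to "."
    extra_opts="--use_kv_cache",         # optional extra flags forwarded to the arg parser
)
```

Internally this still just formats a CLI string, runs it through `build_args_parser()`, and hands the parsed args to `export_llama(args)`.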

examples/models/llama/model.py

Lines changed: 27 additions & 16 deletions
@@ -38,14 +38,13 @@ def __init__(self, **kwargs):
         resource_dir = get_default_model_resource_dir(__file__)
 
         # Use single checkpoint file.
-        checkpoint_path = kwargs.get(
-            "checkpoint", resource_dir / "demo_rand_params.pth"
-        )
-        params_path = kwargs.get("params", resource_dir / "demo_config.json")
-
+        checkpoint_path = kwargs.get("checkpoint", None)
         # Check if checkpoint_dir was provided for a sharded checkpoint.
         checkpoint_dir = kwargs.get("checkpoint_dir", None)
 
+        # Params file.
+        params_path = kwargs.get("params", None)
+
         self.use_kv_cache = kwargs.get("use_kv_cache", False)
         self.use_sdpa_with_kv_cache_op = kwargs.get("use_sdpa_with_kv_cache", False)
         self.generate_full_logits = kwargs.get("generate_full_logits", False)
@@ -66,6 +65,7 @@ def __init__(self, **kwargs):
         # flake8: noqa: TOR102
         cps = []
         # Load sharded checkpoint.
+        checkpoint = {}
         if checkpoint_dir is not None:
             # Load multiple checkpoint; ignore the single path.
             checkpoint_path = None
@@ -93,7 +93,7 @@ def __init__(self, **kwargs):
                     # Do not duplicate layers shared between each checkpoint.
                     checkpoint[key] = cps[0][key]
         # Load single checkpoint.
-        else:
+        elif checkpoint_path:
             checkpoint = torch.load(checkpoint_path, map_location=device, mmap=True)
 
         # If given checkpoint is fairseq, convert to llama checkpoint.
@@ -122,8 +122,12 @@ def __init__(self, **kwargs):
                 """
             )
 
-        with open(params_path, "r") as f:
-            params = json.loads(f.read())
+        # Get optional params.
+        params = {}
+        if params_path:
+            with open(params_path, "r") as f:
+                params = json.loads(f.read())
+
         output_prune_map = None
         if self.output_prune_map_path is not None:
             with open(self.output_prune_map_path, "r") as f:
@@ -170,7 +174,11 @@ def __init__(self, **kwargs):
         with torch.device("meta"):
             # Model itself is loaded in default dtype, fp32.
             self.model_ = Transformer(model_args)
-        self.model_.checkpoint_dtype = get_checkpoint_dtype(checkpoint)
+        # Get checkpoint dtype.
+        if checkpoint:
+            self.model_.checkpoint_dtype = get_checkpoint_dtype(checkpoint)
+        else:
+            self.model_.checkpoint_dtype = None
 
         if "int8" in str(checkpoint_path):
             print("Using int8 weight-only quantization!")
@@ -244,16 +252,19 @@ def __init__(self, **kwargs):
             # Also, the checkpoint is loaded and dtype promoted to the transformer's dtype, which is
             # by default initialized to fp32. This is fine because every other supported type
             # losslessly converts to fp32, so we don't lose precision here.
-            missing, unexpected = self.model_.load_state_dict(
-                checkpoint,
-                strict=False,
-                assign=True,
-            )  # self.model_ = Transformer(gptconf)
+            if checkpoint:
+                missing, unexpected = self.model_.load_state_dict(
+                    checkpoint,
+                    strict=False,
+                    assign=True,
+                )  # self.model_ = Transformer(gptconf)
+            else:
+                print("Checkpoint not provided, defaulting to uninitialized weights.")
+                self.model_.to_empty(device="cpu")
         except RuntimeError as e:
             print(
-                "Could not load checkpoint into mode, defaulting to random uninitialized weights."
+                f"Could not load checkpoint into mode and will default to uninitialized weights due to error: {e}."
             )
-            print(f"Error: {e}")
             # Need to provide concrete (empty) values for meta-initialized tensors for quantization.
             self.model_.to_empty(device="cpu")
 
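
The core of the model.py change is that `checkpoint` and `params` now start as empty dicts and every consumer guards on them, so a missing checkpoint falls through to `to_empty()` instead of failing. A standalone sketch of that fallback pattern using a generic `nn.Module` (not the actual `Llama2Model` code):

```python
import torch
import torch.nn as nn


def materialize(model: nn.Module, checkpoint: dict) -> nn.Module:
    """Load weights when a checkpoint dict is present; otherwise back the
    meta-initialized parameters with empty CPU storage, as this commit does."""
    if checkpoint:
        model.load_state_dict(checkpoint, strict=False, assign=True)
    else:
        print("Checkpoint not provided, defaulting to uninitialized weights.")
        model.to_empty(device="cpu")
    return model


# Build on the meta device (no real storage), then materialize without a checkpoint.
with torch.device("meta"):
    model = nn.Linear(16, 16)
model = materialize(model, checkpoint={})
```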

examples/models/llama/model_args.py

Lines changed: 1 addition & 1 deletion
@@ -8,7 +8,7 @@ class ModelArgs:
     n_layers: int = 32
     n_heads: int = 32
     n_kv_heads: Optional[int] = None
-    vocab_size: int = -1  # defined later by tokenizer
+    vocab_size: int = 512  # Arbitrary value, should be defined later by tokenizer.
     hidden_dim: Optional[int] = None
     head_dim: Optional[int] = None  # Optional customized head_dim
     multiple_of: int = 256  # make SwiGLU hidden layer size multiple of large power of 2
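
The `vocab_size` default switches from the `-1` sentinel to an arbitrary but usable `512`, so a model can be instantiated from defaults alone when no params JSON is given. Roughly, assuming `ModelArgs` is importable from this module path:

```python
from executorch.examples.models.llama.model_args import ModelArgs

# With no params.json supplied, the dataclass defaults now describe a buildable model.
args = ModelArgs()
assert args.vocab_size == 512  # previously -1, which cannot size an embedding table
```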

examples/models/llava/export_llava.py

Lines changed: 11 additions & 1 deletion
@@ -98,7 +98,17 @@ def forward(self, input_pos, embeddings):
     dtype_override = DType.fp32
     parser = build_args_parser()
     args = parser.parse_args(
-        ["-X", "-qmode", "8da4w", "--group_size", "128", "--embedding-quantize", "4,32"]
+        [
+            "-p",
+            "params.json",
+            "-X",
+            "-qmode",
+            "8da4w",
+            "--group_size",
+            "128",
+            "--embedding-quantize",
+            "4,32",
+        ]
     )
     quant_transform = get_quant_weight_transform(args, dtype_override)
     _, quantizers, _ = get_quantizer_and_quant_params(args)
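
Because `--params` no longer falls back to a bundled demo config, the LLaVA export now names its params file explicitly; omitted flags parse to `None`. A small sketch of that behavior, assuming no other flag in `build_args_parser` is required:

```python
from executorch.examples.models.llama.export_llama_lib import build_args_parser

parser = build_args_parser()

# Omitting the now-optional flags leaves them unset instead of pointing at demo files.
args = parser.parse_args([])
assert args.checkpoint is None and args.params is None

# LLaVA's export therefore passes its params file explicitly.
args = parser.parse_args(["-p", "params.json"])
assert args.params == "params.json"
```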
