
Commit 8a5525f

Remove sharded ckpt from export_llama
Sharded checkpoint isn't used anymore; removing it and simplifying export_llama. Differential Revision: [D87828518](https://our.internmc.facebook.com/intern/diff/D87828518/) [ghstack-poisoned]
1 parent 350ea3c commit 8a5525f

3 files changed (+1, -49 lines)

examples/models/llama/export_llama_lib.py

Lines changed: 0 additions & 6 deletions
@@ -229,12 +229,6 @@ def build_args_parser() -> argparse.ArgumentParser:
         help="Path to the checkpoint .pth file. When not provided, the model will be initialized with random weights.",
     )
 
-    parser.add_argument(
-        "--checkpoint_dir",
-        default=None,
-        help="checkpoint directory. Use with a sharded checkpoint, not for the standard llama2 model. Note, checkpoint_dir takes precedence over checkpoint if both are set.",
-    )
-
     parser.add_argument(
         "--adapter_checkpoint",
         required=False,
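
With the flag gone, a single consolidated .pth file is passed via --checkpoint, and anything still passing --checkpoint_dir is rejected as an unrecognized argument. A minimal sketch of the new surface, assuming build_args_parser is importable as examples.models.llama.export_llama_lib and that no other flag is strictly required just to parse (the checkpoint path is a placeholder):

    from examples.models.llama.export_llama_lib import build_args_parser

    parser = build_args_parser()

    # --checkpoint still exists and names a single .pth file.
    args = parser.parse_args(["--checkpoint", "/path/to/consolidated.00.pth"])
    print(args.checkpoint)

    # --checkpoint_dir was removed, so this would now exit with
    # "error: unrecognized arguments: --checkpoint_dir":
    # parser.parse_args(["--checkpoint_dir", "/path/to/shards"])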

examples/models/llama/model.py

Lines changed: 1 addition & 32 deletions
@@ -71,38 +71,7 @@ def __init__(self, llm_config: Optional[LlmConfig] = None):
         # The example is using a dummy small model with random weights for demo purpose only.
         # Follow the instruction in https://github.com/facebookresearch/llama to download the model.
         device = "cpu"
-        # flake8: noqa: TOR102
-        cps = []
-        # Load sharded checkpoint.
-        checkpoint = {}
-        if checkpoint_dir is not None:
-            # Load multiple checkpoint; ignore the single path.
-            checkpoint_path = None
-            for i in range(4):
-                cp_name = f"consolidated.{i}.pth"
-                print(f"Loading {cp_name}")
-                cps.append(
-                    torch.load(
-                        os.path.join(checkpoint_dir, cp_name),
-                        map_location=device,
-                        mmap=True,
-                    )
-                )
-            checkpoint = {}
-            for key in cps[0].keys():
-                if not torch.allclose(cps[0][key], cps[1][key]):
-                    values = (cps[0][key], cps[1][key], cps[2][key], cps[3][key])
-                    if "wo" in key or "w2" in key:
-                        # Concat on dim=1 for "wo" and "w2".
-                        checkpoint[key] = torch.cat(values, dim=1)
-                    else:
-                        # Concat on dim=0 for everything else.
-                        checkpoint[key] = torch.cat(values, dim=0)
-                else:
-                    # Do not duplicate layers shared between each checkpoint.
-                    checkpoint[key] = cps[0][key]
-        # Load single checkpoint.
-        elif checkpoint_path:
+        if checkpoint_path:
             checkpoint = torch.load(checkpoint_path, map_location=device, mmap=True)
 
         # If given checkpoint is fairseq, convert to llama checkpoint.
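
For anyone who still has a 4-way sharded checkpoint, the merge that used to happen here can be done once, offline, and the result passed in as a regular single checkpoint. A minimal sketch mirroring the deleted logic (the consolidated.{i}.pth naming, the dim=1 concat for keys containing "wo" or "w2", and the dim=0 concat for everything else come from the removed code; the consolidate() helper and the output filename are hypothetical):

    import os

    import torch


    def consolidate(checkpoint_dir: str, out_path: str, num_shards: int = 4) -> None:
        # Load every shard on CPU, mmap'ed to keep peak memory low.
        cps = [
            torch.load(
                os.path.join(checkpoint_dir, f"consolidated.{i}.pth"),
                map_location="cpu",
                mmap=True,
            )
            for i in range(num_shards)
        ]
        merged = {}
        for key in cps[0].keys():
            if torch.allclose(cps[0][key], cps[1][key]):
                # Do not duplicate tensors shared between shards.
                merged[key] = cps[0][key]
            elif "wo" in key or "w2" in key:
                # As in the removed code: concat on dim=1 for "wo" and "w2".
                merged[key] = torch.cat([cp[key] for cp in cps], dim=1)
            else:
                # Concat on dim=0 for everything else.
                merged[key] = torch.cat([cp[key] for cp in cps], dim=0)
        torch.save(merged, out_path)


    consolidate("/path/to/sharded/checkpoints", "consolidated.merged.pth")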

extension/llm/export/config/llm_config.py

Lines changed: 0 additions & 11 deletions
@@ -76,7 +76,6 @@ class BaseConfig:
        If left empty, the model will either be initialized with random weights
        if it is a Llama model or the weights will be downloaded from HuggingFace
        if it is a non-Llama model.
-    checkpoint_dir: Path to directory containing sharded checkpoint files.
     adapter_checkpoint: Path to the adapter.pt file from torchtune. Used if
        the model has trained LoRA adapters. Must provide
        adapter_config.json.

@@ -87,10 +86,6 @@
        e.g. '"{\"get_bos_id\":128000, \"get_eos_ids\":[128009, 128001]}"'
     use_lora: Only for use with QAT. Rank of the LoRA adapter, disabled
        if set to 0.
-    fairseq2: For legacy internal use cases, this is safe to ignore.
-    preq_mode: Legacy option to specify how prequantized weights are loaded.
-       Going forward, ExecuTorch supports loading weights prequantized through
-       TorchAo as-is, without any special handling.
     preq_group_size: Legacy option to specify the group size of prequantized weights.
     preq_embedding_quantize: Legacy option to specify how prequantized embeddings
        are loaded.

@@ -99,13 +94,11 @@
     model_class: ModelType = ModelType.llama3
     params: Optional[str] = None
     checkpoint: Optional[str] = None
-    checkpoint_dir: Optional[str] = None
     adapter_checkpoint: Optional[str] = None
     adapter_config: Optional[str] = None
     tokenizer_path: Optional[str] = None
     metadata: Optional[str] = None
     use_lora: int = 0
-    fairseq2: bool = False
     preq_mode: Optional[PreqMode] = None
     preq_group_size: int = 32
     preq_embedding_quantize: str = "8,0"

@@ -527,8 +520,6 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
             llm_config.base.params = args.params
         if hasattr(args, "checkpoint"):
             llm_config.base.checkpoint = args.checkpoint
-        if hasattr(args, "checkpoint_dir"):
-            llm_config.base.checkpoint_dir = args.checkpoint_dir
         if hasattr(args, "adapter_checkpoint"):
             llm_config.base.adapter_checkpoint = args.adapter_checkpoint
         if hasattr(args, "adapter_config"):

@@ -539,8 +530,6 @@ def from_args(cls, args: argparse.Namespace) -> "LlmConfig": # noqa: C901
             llm_config.base.metadata = args.metadata
         if hasattr(args, "use_lora"):
             llm_config.base.use_lora = args.use_lora
-        if hasattr(args, "fairseq2"):
-            llm_config.base.fairseq2 = args.fairseq2
 
         # PreqMode settings
         if hasattr(args, "preq_mode") and args.preq_mode:
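
After this change the base section only ever names a single checkpoint file; checkpoint_dir and fairseq2 are no longer fields on BaseConfig. A minimal sketch of the programmatic equivalent, assuming LlmConfig can be constructed with its defaults and exposes the base section shown above (the import path simply mirrors the file path in this diff, and the checkpoint path is a placeholder):

    from extension.llm.export.config.llm_config import LlmConfig

    llm_config = LlmConfig()

    # The single-file field is the only checkpoint knob left.
    llm_config.base.checkpoint = "/path/to/consolidated.00.pth"

    # No longer defined on BaseConfig after this commit:
    # llm_config.base.checkpoint_dir = "/path/to/shards"
    # llm_config.base.fairseq2 = True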
