
Commit 0d1e560

Add load support for torchchat checkpoint

1 parent 8990f41

3 files changed: +127, -26 lines changed


dist_run.py

Lines changed: 32 additions & 9 deletions
@@ -25,6 +25,7 @@
 from torchchat.distributed.safetensor_utils import (
     get_hf_config_file,
     load_weights_from_hf_format,
+    load_weights_from_torchchat_format,
 )
 from torchchat.distributed.utils import (
     bytes_to_readable,
@@ -57,10 +58,6 @@
     "llama3": ("meta-llama/Meta-Llama-3-8B-Instruct", torch.bfloat16),
 }

-# This format stands for: index file + multiple safetensor.
-USE_HF_CHECKPOINT_FORMAT = True
-# TODO: add support for single bin format.
-

 def _init_distributed():
     dist.init_process_group("nccl")
@@ -113,14 +110,33 @@ def _build_chat_tokenizer(
     return tokenizer


-def _load_model_weights(stage_module, distribution, device, model_config):
+def _load_model_weights(
+    stage_module: torch.nn.Module,
+    distribution: str,
+    device: torch.device,
+    model_config: ModelArgs,
+    chpt_from: str,
+):
     """Load the weights from the safetensor file(s) into the model stage.
     Model config is needed b/c we permute wq and wk weights based on attn heads.
+
+    Args:
+        stage_module (torch.nn.Module): The model stage to load the weights into.
+        distribution (str): The distribution name, e.g. "meta-llama/Meta-Llama-3-8B-Instruct".
+        device (torch.device): The device to load the weights onto.
+        model_config (ModelArgs): The model config.
+        chpt_from (str): The checkpoint format to load the weights from, e.g. "torchchat" or "hf".
     """
-    if USE_HF_CHECKPOINT_FORMAT:
+    if chpt_from == "hf":
+        # This format stands for: index file + multiple binary files
         load_weights_from_hf_format(stage_module, distribution, device, model_config)
-    else:
+    elif chpt_from == "torchchat":
+        # This format stands for:
+        # single binary file, OR
+        # multiple binary files without index files.
         load_weights_from_torchchat_format(stage_module, distribution, device, model_config)
+    else:
+        raise ValueError(f"Unknown checkpoint format: {chpt_from}")


 def _encode_strings(
@@ -277,7 +293,7 @@ def main(args):
     logger.info(f"{color.yellow} {gpu_memory_monitor.get_device_info()}{color.reset}")

     distribution, model_dtype = NAME_TO_DISTRIBUTION_AND_DTYPE[model_name]
-    logger.info(f"Using HF model weights from {distribution} and dtype {model_dtype}")
+    logger.info(f"Using model weights from {distribution} and dtype {model_dtype}")

     # Model-level config
     model_config = ModelArgs.from_name(distribution)
@@ -339,7 +355,7 @@ def main(args):
     # Load weights
     logger.info(f"Loading weights for {pp_rank=} on {device=}")
     with CUDATrackTime() as timer:
-        _load_model_weights(model, distribution, device=device, model_config=config)
+        _load_model_weights(model, distribution, device, config, args.chpt_from)

     logger.info(
         f"{color.green}Total weight loading time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}"
@@ -570,6 +586,13 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
         default=False,
         help="Whether to decode token into string in flight",
     )
+    parser.add_argument(
+        "--chpt-from",
+        type=str,
+        default="hf",  # TODO: change to torchchat once we support it well
+        help="Checkpoint format to load from",
+        choices=["hf", "torchchat"],
+    )
     args = parser.parse_args()

     main(args)
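
For context, a minimal runnable sketch of how the new --chpt-from flag flows from argparse into the dispatch in _load_model_weights. The stub loaders below stand in for the real functions in torchchat/distributed/safetensor_utils.py; the call shapes follow the diff above:

import argparse

# Stubs standing in for the real torchchat loaders.
def load_weights_from_hf_format(stage_module, distribution, device, model_config):
    print("HF format: index file + multiple binary files")

def load_weights_from_torchchat_format(stage_module, distribution, device, model_config):
    print("torchchat format: single binary file, or multiple binary files without an index")

def _load_model_weights(stage_module, distribution, device, model_config, chpt_from):
    # Same dispatch shape as the commit: unknown formats fail fast.
    if chpt_from == "hf":
        load_weights_from_hf_format(stage_module, distribution, device, model_config)
    elif chpt_from == "torchchat":
        load_weights_from_torchchat_format(stage_module, distribution, device, model_config)
    else:
        raise ValueError(f"Unknown checkpoint format: {chpt_from}")

parser = argparse.ArgumentParser()
parser.add_argument("--chpt-from", type=str, default="hf", choices=["hf", "torchchat"])
args = parser.parse_args(["--chpt-from", "torchchat"])
_load_model_weights(None, "meta-llama/Meta-Llama-3-8B-Instruct", "cuda", None, args.chpt_from)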

torchchat/cli/builder.py

Lines changed: 11 additions & 5 deletions
@@ -335,11 +335,7 @@ def _load_model_gguf(builder_args: BuilderArgs) -> Model:
     return model


-def _load_model_default(builder_args: BuilderArgs) -> Model:
-    assert not builder_args.gguf_path
-
-    model: Model = _init_model_on_meta_device(builder_args)
-
+def _load_checkpoint(builder_args: BuilderArgs):
     if builder_args.params_table and builder_args.params_table.endswith("Tune"):
         print("Loading Tune checkpoint")
         meta_checkpoint = torch.load(
@@ -377,6 +373,16 @@ def _load_model_default(builder_args: BuilderArgs) -> Model:
         mmap=True,
         weights_only=True,
     )
+    return checkpoint
+
+
+def _load_model_default(builder_args: BuilderArgs) -> Model:
+    assert not builder_args.gguf_path
+
+    model: Model = _init_model_on_meta_device(builder_args)
+
+    # Load checkpoint from filesystem
+    checkpoint = _load_checkpoint(builder_args)

     if "model" in checkpoint and "stories" in str(builder_args.checkpoint_path):
         checkpoint = checkpoint["model"]
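
The extraction above makes checkpoint loading reusable outside _load_model_default. A minimal sketch of calling it directly, assuming only the three BuilderArgs fields that the distributed loader (next file) passes; the checkpoint path is hypothetical, adjust it to your cache layout:

from pathlib import Path

from torchchat.cli.builder import BuilderArgs, _load_checkpoint

builder_args = BuilderArgs(
    device="cpu",
    checkpoint_path=Path(
        "~/.torchchat/model-cache/meta-llama/Meta-Llama-3-8B-Instruct/model.pth"
    ).expanduser(),  # hypothetical single-file checkpoint location
    checkpoint_dir=None,
)
checkpoint = _load_checkpoint(builder_args)  # a plain dict of parameter tensors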

torchchat/distributed/safetensor_utils.py

Lines changed: 84 additions & 12 deletions
@@ -12,9 +12,11 @@
 import json
 from torch.nn import Module
 from typing import Any, Dict, Tuple, Set, Optional
+from pathlib import Path

 from torch.distributed._tensor import DTensor
 from torchchat.distributed.dtensor_utils import convert_to_dtensor
+from torchchat.cli.builder import BuilderArgs, _load_checkpoint


 _DEFAULT_SAFETENSOR_FILE_NAME = "model.safetensors.index.json"
@@ -182,10 +184,10 @@ def load_safetensor_weights(
            update_state_dict(
                stage_state_dict,
                checkpoint,
-                new_to_old_keymap,
-                updated_states,
                device,
-                model_config,
+                model_config=model_config,
+                new_to_old_keymap=new_to_old_keymap,
+                updated_states=updated_states,
            )
        except FileNotFoundError:
            logger.error(f"File not found: {full_path}")
@@ -264,24 +266,36 @@ def permute_weight_to_attn_heads(w, n_heads, head_dim, model_dim):
 def update_state_dict(
     state_dict: Dict[str, torch.Tensor],
     checkpoint: Dict[str, torch.Tensor],
-    new_to_old_keymap: Dict[str, str],
-    updated_states: Set[str],
     device: torch.device,
     model_config: Optional[Dict] = None,
+    new_to_old_keymap: Optional[Dict[str, str]] = None,
+    updated_states: Optional[Set[str]] = None,
 ):
+    """
+    Update the state dict with the checkpoint tensors.
+    Note:
+    - For HF format, `new_to_old_keymap` is a mapping from the new key to the
+      old key.
+    - For torchchat format, `new_to_old_keymap` is None (because FQN conversion
+      has been done by the torchchat download script).
+    """
     # for handling attn head permuting
     num_heads = model_config.n_heads
     dim = model_config.dim
     num_local_heads = model_config.n_local_heads
     head_dim = model_config.head_dim

     for param in state_dict.keys():
-        # TODO: clean this up together with `purge_fqn_prefix` when we switch
-        # from creating Transformer to creating model
-        model_param = (
-            "output.weight" if param == "output.weight" else f"model.{param}"
-        )
-        old_param = new_to_old_keymap.get(model_param)
+        if new_to_old_keymap is not None:
+            # TODO: clean the following manual prefix together with
+            # `purge_fqn_prefix` when we switch from creating Transformer to
+            # creating model
+            model_param = (
+                "output.weight" if param == "output.weight" else f"model.{param}"
+            )
+            old_param = new_to_old_keymap[model_param]
+        else:
+            old_param = param

         if old_param not in checkpoint:
             # Maybe this param is in other files
@@ -309,7 +323,9 @@ def update_state_dict(

         # Update model state dict with checkpoint tensor
         state_dict[param] = checkpoint_tensor
-        updated_states.add(param)
+
+        if updated_states is not None:
+            updated_states.add(param)


 def format_tensor_info(tensor: torch.Tensor) -> str:
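
With new_to_old_keymap and updated_states now optional keyword arguments, the two checkpoint formats call update_state_dict differently. A schematic comparison, taken from the call sites in this commit:

# HF format: checkpoint keys must be remapped, so a keymap is passed.
update_state_dict(
    stage_state_dict,
    checkpoint,
    device,
    model_config=model_config,
    new_to_old_keymap=new_to_old_keymap,
    updated_states=updated_states,
)

# torchchat format: FQNs were already converted by the download script,
# so no keymap; checkpoint keys are used as-is (old_param = param).
update_state_dict(
    stage_state_dict,
    checkpoint,
    device,
    model_config=model_config,
    updated_states=updated_states,
)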
@@ -378,3 +394,59 @@ def load_weights_from_hf_format(stage_module, distribution, device, model_config
     )
     if num_missing_weights > 0:
         raise ValueError(f"Missing {num_missing_weights} weights")
+
+
+# HACK: assuming single file for torchchat's converted checkpoints. We should
+# remove this after converging to torchchat's model building process.
+# In particular,
+#     builder_args = BuilderArgs.from_args(args)
+# will tell us if there is a single file or a directory.
+TORCHCHAT_SINGLE_FILE_CHECKPOINT = True
+
+def load_weights_from_torchchat_format(stage_module, distribution, device, model_config):
+    """
+    Load the weights from torchchat format (single binary file), and fill into
+    `stage_module`. Model config is needed b/c we permute wq and wk weights
+    based on attn heads.
+    """
+    stage_state_dict = stage_module.state_dict()
+    # TODO: clean this up together with `purge_fqn_prefix` when we switch from creating Transformer to creating model
+    stage_state_dict = purge_fqn_prefix(stage_state_dict, "model.")
+
+    # Load checkpoint from torchchat cache
+    default_cache_dir = Path(
+        os.getenv("TORCHCHAT_MODELDIR", "~/.torchchat/model-cache")
+    ).expanduser()
+    # Distribution is like "meta-llama/Meta-Llama-3-8B-Instruct".
+    # Join it with the default cache dir to get the checkpoint dir.
+    checkpoint_dir = default_cache_dir / distribution
+    # Provide a path in the single-file case, a dir in the multi-file case. See
+    # `_load_checkpoint`.
+    if TORCHCHAT_SINGLE_FILE_CHECKPOINT:
+        checkpoint_path = checkpoint_dir / "model.pth"
+        checkpoint_dir = None
+    else:
+        checkpoint_path = None
+    # First, construct BuilderArgs
+    args_dict = {
+        "device": device,
+        "checkpoint_dir": checkpoint_dir,
+        "checkpoint_path": checkpoint_path,
+    }
+    builder_args = BuilderArgs(**args_dict)
+    # Then, load the checkpoint using torchchat util
+    checkpoint = _load_checkpoint(builder_args)
+
+    updated_states: Set[str] = set()
+    # This step converts full tensors into DTensors
+    update_state_dict(
+        stage_state_dict,
+        checkpoint,
+        device,
+        model_config=model_config,
+        updated_states=updated_states,
+    )
+
+    # Fill state dict into stage module
+    stage_module.load_state_dict(stage_state_dict, strict=False, assign=True)
+    logger.info(f"Successfully loaded {len(updated_states)} weights into stage module")
