This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit 9e1f421

Merge branch 'main' into bump104
2 parents 6f081ce + 6a2a2e8 commit 9e1f421

11 files changed: +478 additions, -293 deletions

dist_run.py

Lines changed: 66 additions & 29 deletions
@@ -10,6 +10,7 @@
 
 import argparse
 import os
+from enum import auto, Enum
 from pathlib import Path
 from types import SimpleNamespace
 from typing import Any, Dict, List, Optional, Tuple
@@ -22,10 +23,10 @@
 from torchchat.distributed.logging_utils import SingletonLogger
 
 # TODO - these are not distributed specific, consider moving to new package
-from torchchat.distributed.safetensor_utils import (
+from torchchat.distributed.checkpoint_utils import (
     get_hf_config_file,
-    get_hf_weight_map_and_path,
-    load_safetensor_weights,
+    load_weights_from_hf_format,
+    load_weights_from_torchchat_format,
 )
 from torchchat.distributed.utils import (
     bytes_to_readable,
@@ -49,6 +50,7 @@
 
 
 logger = SingletonLogger.get_logger()
+_tokenizer_type = None  # global variable to store the tokenizer type
 
 # Using model name to identify the model to load, for example "llama2-7b-chat".
 # You can change it to other values listed below.
@@ -59,6 +61,11 @@
 }
 
 
+class TokenizerType(Enum):
+    Tiktoken = auto()
+    SentencePiece = auto()
+
+
 def _init_distributed():
     dist.init_process_group("nccl")
     rank = dist.get_rank()
@@ -80,7 +87,10 @@ def _build_chat_tokenizer(
     model_name: str,
     model_base_name: Optional[str] = None,
 ) -> SentencePieceProcessor | TiktokenTokenizer:
-    """Builds a tokenizer for the given model name."""
+    """Builds a tokenizer for the given model name, and sets the global tokenizer type variable"""
+
+    global _tokenizer_type
+
     # Try to infer the model base name from the model name:
     # e.g. "llama2-7b-chat" -> "llama2"
     if model_base_name is None:
@@ -107,29 +117,45 @@ def _build_chat_tokenizer(
     logger.info(
         f"using tokenizer = {tokenizer.__class__.__module__}.{tokenizer.__class__.__name__}"
     )
+    # set global variable _tokenizer_type
+    if isinstance(tokenizer, TiktokenTokenizer):
+        _tokenizer_type = TokenizerType.Tiktoken
+    elif isinstance(tokenizer, SentencePieceProcessor):
+        _tokenizer_type = TokenizerType.SentencePiece
+    else:
+        raise ValueError(f"Unknown tokenizer type: {tokenizer.__class__}")
+
+    logger.info(f"tokenizer type = {_tokenizer_type}")
     return tokenizer
 
 
-def _load_model_weights(stage_module, distribution, device, model_config):
+def _load_model_weights(
+    stage_module: torch.nn.Module,
+    distribution: str,
+    device: torch.device,
+    model_config: ModelArgs,
+    chpt_from: str,
+):
     """Load the weights from the safetensor file(s) into the model stage.
     Model config is needed b/c we permute wq and wk weights based on attn heads.
-    """
 
-    weight_map, weight_path, key_map = get_hf_weight_map_and_path(distribution)
-
-    num_loaded_weights, num_missing_weights = load_safetensor_weights(
-        stage_module,
-        weight_map,
-        weight_path,
-        key_map,
-        device,
-        model_config=model_config,
-    )
-    logger.info(
-        f"Success - Loaded {num_loaded_weights} weights, {num_missing_weights} missing weights"
-    )
-    if num_missing_weights > 0:
-        raise ValueError(f"Missing {num_missing_weights} weights")
+    Args:
+        stage_module (torch.nn.Module): The model stage to load the weights into.
+        distribution (str): The distribution name, e.g. "meta-llama/Meta-Llama-3-8B-Instruct".
+        device (torch.device): The device to load the weights onto.
+        model_config (ModelArgs): The model config.
+        chpt_from (str): The checkpoint format to load the weights from, e.g. "torchchat" or "hf".
+    """
+    if chpt_from == "hf":
+        # This format stands for: index file + multiple binary files
+        load_weights_from_hf_format(stage_module, distribution, device, model_config)
+    elif chpt_from == "torchchat":
+        # This format stands for:
+        # single binary file, OR
+        # multiple binary files without index files.
+        load_weights_from_torchchat_format(stage_module, distribution, device, model_config)
+    else:
+        raise ValueError(f"Unknown checkpoint format: {chpt_from}")
 
 
 def _encode_strings(
@@ -269,6 +295,7 @@ def _cleanup():
 
 prompt = [
     "What is Snow?",
+    # "Can you explain what is the purpose of back propagation in neural networks?",
     "Who is Santa Claus?",
     "Where does Santa live?",
     # "Who is Abraham Lincoln?",
@@ -286,7 +313,7 @@ def main(args):
     logger.info(f"{color.yellow} {gpu_memory_monitor.get_device_info()}{color.reset}")
 
     distribution, model_dtype = NAME_TO_DISTRIBUTION_AND_DTYPE[model_name]
-    logger.info(f"Using HF model weights from {distribution} and dtype {model_dtype}")
+    logger.info(f"Using model weights from {distribution} and dtype {model_dtype}")
 
     # Model-level config
     model_config = ModelArgs.from_name(distribution)
@@ -348,7 +375,7 @@ def main(args):
     # Load weights
     logger.info(f"Loading weights for {pp_rank=} on {device=}")
     with CUDATrackTime() as timer:
-        _load_model_weights(model, distribution, device=device, model_config=config)
+        _load_model_weights(model, distribution, device, config, args.chpt_from)
 
     logger.info(
         f"{color.green}Total weight loading time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}"
@@ -487,7 +514,7 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
         group=pp_group,
     )
     # create schedule
-    decorder = ScheduleGPipe(decode_stage, 1)
+    decoder = ScheduleGPipe(decode_stage, 1)
 
     # Decoding
     with torch.no_grad(), CUDATrackTime() as timer:
@@ -510,11 +537,11 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
 
             # Run data through pipeline
             if pp_rank == first_pp_rank:
-                output = decorder.step(new_token, **kwargs)
+                output = decoder.step(new_token, **kwargs)
             elif pp_rank == last_pp_rank:
-                output = decorder.step(**kwargs)
+                output = decoder.step(**kwargs)
             else:  # middle pp ranks
-                decorder.step(**kwargs)
+                decoder.step(**kwargs)
 
             # Decode the output
             if pp_rank == last_pp_rank:
@@ -539,13 +566,16 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
         # token ids. Thus cat'ing along dim 1.
         res = torch.cat(res, dim=1)
        res_list = res.tolist()
-        if isinstance(tokenizer, TiktokenTokenizer):
+        if _tokenizer_type == TokenizerType.Tiktoken:
            # For TiktokenTokenizer, we need to decode prompt by prompt.
            # TODO: is there a better way to do this?
            responses = [tokenizer.decode(sequence) for sequence in res_list]
-        else:  # SentencePieceProcessor
+        elif _tokenizer_type == TokenizerType.SentencePiece:  # SentencePieceProcessor
            # For SentencePieceProcessor, we can decode the entire 2D list at once.
            responses = tokenizer.decode(res_list)
+        else:
+            raise ValueError(f"Unknown tokenizer type {_tokenizer_type}")
+
        # Show prompts and responses
        for prompt_text, response_text in zip(prompt, responses):
            logger.info(f"Prompt: {color.green}{prompt_text} {color.reset}")
@@ -579,6 +609,13 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
        default=False,
        help="Whether to decode token into string in flight",
    )
+    parser.add_argument(
+        "--chpt-from",
+        type=str,
+        default="hf",  # TODO: change to torchchat once we support it well
+        help="Checkpoint format to load from",
+        choices=["hf", "torchchat"],
+    )
    args = parser.parse_args()
 
    main(args)
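A note on the new flag (commentary, not part of the diff): argparse converts the dash in "--chpt-from" into an underscore, which is how the value reaches the _load_model_weights(model, distribution, device, config, args.chpt_from) call above. A minimal standalone sketch mirroring the add_argument call in the diff:

import argparse

# Sketch only: the --chpt-from option as defined in the diff above.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--chpt-from",
    type=str,
    default="hf",
    choices=["hf", "torchchat"],
    help="Checkpoint format to load from",
)

# argparse exposes "--chpt-from" as the attribute "chpt_from".
args = parser.parse_args(["--chpt-from", "torchchat"])
assert args.chpt_from == "torchchat"

# A value outside `choices` makes parse_args() exit with an error, so
# _load_model_weights only ever sees "hf" or "torchchat".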

install/install_requirements.sh

Lines changed: 3 additions & 0 deletions
@@ -67,6 +67,9 @@ TUNE_NIGHTLY_VERSION=dev20240928
 if [[ -x "$(command -v nvidia-smi)" ]];
 then
   TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cu121"
+elif [[ -x "$(command -v rocminfo)" ]];
+then
+  TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/rocm6.2"
 else
   TORCH_NIGHTLY_URL="https://download.pytorch.org/whl/nightly/cpu"
 fi
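For readers less familiar with shell, the detection above simply checks whether the nvidia-smi or rocminfo binary is on PATH and picks the matching nightly wheel index. A rough Python equivalent, for illustration only (not part of the commit; shutil.which plays the role of `command -v`):

import shutil

# Illustration only: the same CUDA / ROCm / CPU selection the shell script performs.
if shutil.which("nvidia-smi"):
    torch_nightly_url = "https://download.pytorch.org/whl/nightly/cu121"
elif shutil.which("rocminfo"):
    torch_nightly_url = "https://download.pytorch.org/whl/nightly/rocm6.2"
else:
    torch_nightly_url = "https://download.pytorch.org/whl/nightly/cpu"

print(torch_nightly_url)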

torchchat/cli/builder.py

Lines changed: 11 additions & 5 deletions
@@ -335,11 +335,7 @@ def _load_model_gguf(builder_args: BuilderArgs) -> Model:
     return model
 
 
-def _load_model_default(builder_args: BuilderArgs) -> Model:
-    assert not builder_args.gguf_path
-
-    model: Model = _init_model_on_meta_device(builder_args)
-
+def _load_checkpoint(builder_args: BuilderArgs):
     if builder_args.params_table and builder_args.params_table.endswith("Tune"):
         print("Loading Tune checkpoint")
         meta_checkpoint = torch.load(
@@ -377,6 +373,16 @@ def _load_model_default(builder_args: BuilderArgs) -> Model:
             mmap=True,
             weights_only=True,
         )
+    return checkpoint
+
+
+def _load_model_default(builder_args: BuilderArgs) -> Model:
+    assert not builder_args.gguf_path
+
+    model: Model = _init_model_on_meta_device(builder_args)
+
+    # Load checkpoint from filesystem
+    checkpoint = _load_checkpoint(builder_args)
 
     if "model" in checkpoint and "stories" in str(builder_args.checkpoint_path):
         checkpoint = checkpoint["model"]
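The extracted _load_checkpoint helper is essentially a thin wrapper around torch.load, now reusable outside of _load_model_default. A minimal sketch of the underlying call (the "model.pth" path and map_location are placeholders, not values from the repo): mmap=True memory-maps the checkpoint instead of reading it fully into RAM, and weights_only=True restricts unpickling to tensors and plain containers.

import torch

# Sketch only: the kind of call _load_checkpoint wraps; "model.pth" is a hypothetical path.
checkpoint = torch.load(
    "model.pth",
    map_location="cpu",
    mmap=True,          # memory-map the file rather than loading it all into RAM
    weights_only=True,  # safer unpickling: tensors and basic containers only
)
print(f"loaded {len(checkpoint)} entries")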

torchchat/cli/convert_hf_checkpoint.py

Lines changed: 17 additions & 8 deletions
@@ -81,10 +81,17 @@ def convert_hf_checkpoint(
         "model.layers.{}.self_attn.k_proj.weight": "layers.{}.attention.wk.weight",
         "model.layers.{}.self_attn.v_proj.weight": "layers.{}.attention.wv.weight",
         "model.layers.{}.self_attn.o_proj.weight": "layers.{}.attention.wo.weight",
+        "model.layers.{}.self_attn.q_proj.bias": "layers.{}.attention.wq.bias",
+        "model.layers.{}.self_attn.k_proj.bias": "layers.{}.attention.wk.bias",
+        "model.layers.{}.self_attn.v_proj.bias": "layers.{}.attention.wv.bias",
+        "model.layers.{}.self_attn.o_proj.bias": "layers.{}.attention.wo.bias",
         "model.layers.{}.self_attn.rotary_emb.inv_freq": None,
         "model.layers.{}.mlp.gate_proj.weight": "layers.{}.feed_forward.w1.weight",
         "model.layers.{}.mlp.up_proj.weight": "layers.{}.feed_forward.w3.weight",
         "model.layers.{}.mlp.down_proj.weight": "layers.{}.feed_forward.w2.weight",
+        "model.layers.{}.mlp.gate_proj.bias": "layers.{}.feed_forward.w1.bias",
+        "model.layers.{}.mlp.up_proj.bias": "layers.{}.feed_forward.w3.bias",
+        "model.layers.{}.mlp.down_proj.bias": "layers.{}.feed_forward.w2.bias",
         "model.layers.{}.input_layernorm.weight": "layers.{}.attention_norm.weight",
         "model.layers.{}.post_attention_layernorm.weight": "layers.{}.ffn_norm.weight",
         "model.norm.weight": "norm.weight",
@@ -93,11 +100,10 @@ def convert_hf_checkpoint(
     bin_files = {model_dir / bin for bin in bin_index["weight_map"].values()}
 
     def permute(w, n_heads):
-        dim = config.dim
         return (
-            w.view(n_heads, 2, config.head_dim // 2, dim)
+            w.view(n_heads, 2, config.head_dim // 2, *w.shape[1:])
             .transpose(1, 2)
-            .reshape(config.head_dim * n_heads, dim)
+            .reshape(w.shape)
         )
 
     merged_result = {}
@@ -130,6 +136,7 @@ def load_safetensors():
                 continue
         assert state_dict is not None, f"Unable to load tensors from {file}"
         merged_result.update(state_dict)
+
     final_result = {}
     for key, value in merged_result.items():
         if "layers" in key:
@@ -145,16 +152,18 @@ def load_safetensors():
             final_result[new_key] = value
 
     for key in tuple(final_result.keys()):
-        if "wq" in key:
+        if "wq.weight" in key or "wq.bias" in key:
+            wk_key = key.replace("wq", "wk")
+            wv_key = key.replace("wq", "wv")
             q = final_result[key]
-            k = final_result[key.replace("wq", "wk")]
-            v = final_result[key.replace("wq", "wv")]
+            k = final_result[wk_key]
+            v = final_result[wv_key]
             q = permute(q, config.n_heads)
             k = permute(k, config.n_local_heads)
             final_result[key.replace("wq", "wqkv")] = torch.cat([q, k, v])
             del final_result[key]
-            del final_result[key.replace("wq", "wk")]
-            del final_result[key.replace("wq", "wv")]
+            del final_result[wk_key]
+            del final_result[wv_key]
     print(f"Saving checkpoint to {model_dir / 'model.pth'}. This may take a while.")
     torch.save(final_result, model_dir / "model.pth")
     print("Done.")
