This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit 8c25a9d
Merge branch 'main' into stories_browser_fix
2 parents 64a2393 + b217158

4 files changed, +91 -11 lines changed

dist_run.py

Lines changed: 30 additions & 7 deletions
@@ -10,6 +10,7 @@
 
 import argparse
 import os
+from enum import auto, Enum
 from pathlib import Path
 from types import SimpleNamespace
 from typing import Any, Dict, List, Optional, Tuple
@@ -49,6 +50,7 @@
 
 
 logger = SingletonLogger.get_logger()
+_tokenizer_type = None  # global variable to store the tokenizer type
 
 # Using model name to identify the model to load, for example "llama2-7b-chat".
 # You can change it to other values listed below.
@@ -59,6 +61,11 @@
 }
 
 
+class TokenizerType(Enum):
+    Tiktoken = auto()
+    SentencePiece = auto()
+
+
 def _init_distributed():
     dist.init_process_group("nccl")
     rank = dist.get_rank()
@@ -80,7 +87,10 @@ def _build_chat_tokenizer(
     model_name: str,
     model_base_name: Optional[str] = None,
 ) -> SentencePieceProcessor | TiktokenTokenizer:
-    """Builds a tokenizer for the given model name."""
+    """Builds a tokenizer for the given model name, and sets the global tokenizer type variable"""
+
+    global _tokenizer_type
+
     # Try to infer the model base name from the model name:
     # e.g. "llama2-7b-chat" -> "llama2"
     if model_base_name is None:
@@ -107,6 +117,15 @@ def _build_chat_tokenizer(
     logger.info(
         f"using tokenizer = {tokenizer.__class__.__module__}.{tokenizer.__class__.__name__}"
     )
+    # set global variable _tokenizer_type
+    if isinstance(tokenizer, TiktokenTokenizer):
+        _tokenizer_type = TokenizerType.Tiktoken
+    elif isinstance(tokenizer, SentencePieceProcessor):
+        _tokenizer_type = TokenizerType.SentencePiece
+    else:
+        raise ValueError(f"Unknown tokenizer type: {tokenizer.__class__}")
+
+    logger.info(f"tokenizer type = {_tokenizer_type}")
     return tokenizer
 
 
@@ -269,6 +288,7 @@ def _cleanup():
 
 prompt = [
     "What is Snow?",
+    # "Can you explain what is the purpose of back propagation in neural networks?",
     "Who is Santa Claus?",
     "Where does Santa live?",
     # "Who is Abraham Lincoln?",
@@ -487,7 +507,7 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
         group=pp_group,
     )
     # create schedule
-    decorder = ScheduleGPipe(decode_stage, 1)
+    decoder = ScheduleGPipe(decode_stage, 1)
 
     # Decoding
     with torch.no_grad(), CUDATrackTime() as timer:
@@ -510,11 +530,11 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
 
         # Run data through pipeline
         if pp_rank == first_pp_rank:
-            output = decorder.step(new_token, **kwargs)
+            output = decoder.step(new_token, **kwargs)
         elif pp_rank == last_pp_rank:
-            output = decorder.step(**kwargs)
+            output = decoder.step(**kwargs)
         else:  # middle pp ranks
-            decorder.step(**kwargs)
+            decoder.step(**kwargs)
 
         # Decode the output
         if pp_rank == last_pp_rank:
@@ -539,13 +559,16 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
         # token ids. Thus cat'ing along dim 1.
         res = torch.cat(res, dim=1)
         res_list = res.tolist()
-        if isinstance(tokenizer, TiktokenTokenizer):
+        if _tokenizer_type == TokenizerType.Tiktoken:
            # For TiktokenTokenizer, we need to decode prompt by prompt.
            # TODO: is there a better way to do this?
            responses = [tokenizer.decode(sequence) for sequence in res_list]
-        else:  # SentencePieceProcessor
+        elif _tokenizer_type == TokenizerType.SentencePiece:  # SentencePieceProcessor
            # For SentencePieceProcessor, we can decode the entire 2D list at once.
            responses = tokenizer.decode(res_list)
+        else:
+            raise ValueError(f"Unknown tokenizer type {_tokenizer_type}")
+
         # Show prompts and responses
         for prompt_text, response_text in zip(prompt, responses):
             logger.info(f"Prompt: {color.green}{prompt_text} {color.reset}")

torchchat/cli/convert_hf_checkpoint.py

Lines changed: 32 additions & 2 deletions
@@ -3,6 +3,7 @@
 
 # This source code is licensed under the license found in the
 # LICENSE file in the root directory of this source tree.
+import glob
 import json
 import os
 import re
@@ -41,7 +42,12 @@ def convert_hf_checkpoint(
     print(f"Model config {config.__dict__}")
 
     # Load the json file containing weight mapping
-    model_map_json = model_dir / "pytorch_model.bin.index.json"
+    model_map_json_matches = [Path(m) for m in glob.glob(str(model_dir / "*.index.json"))]
+    assert len(model_map_json_matches) <= 1, "Found multiple weight mapping files"
+    if len(model_map_json_matches):
+        model_map_json = model_map_json_matches[0]
+    else:
+        model_map_json = model_dir / "pytorch_model.bin.index.json"
 
     # If there is no weight mapping, check for a consolidated model and
     # tokenizer we can move. Llama 2 and Mistral have weight mappings, while
@@ -96,9 +102,33 @@ def permute(w, n_heads):
 
     merged_result = {}
     for file in sorted(bin_files):
-        state_dict = torch.load(
+
+        # The state_dict can be loaded from either a torch zip file or
+        # safetensors. We take our best guess from the name and try all
+        # possibilities
+        load_pt_mmap = lambda: torch.load(
             str(file), map_location="cpu", mmap=True, weights_only=True
         )
+        load_pt_no_mmap = lambda: torch.load(
+            str(file), map_location="cpu", mmap=False, weights_only=True
+        )
+        def load_safetensors():
+            import safetensors.torch
+            with open(file, "rb") as handle:
+                return safetensors.torch.load(handle.read())
+        if "safetensors" in str(file):
+            loaders = [load_safetensors, load_pt_mmap, load_pt_no_mmap]
+        else:
+            loaders = [load_pt_mmap, load_pt_no_mmap, load_safetensors]
+
+        state_dict = None
+        for loader in loaders:
+            try:
+                state_dict = loader()
+                break
+            except Exception:
+                continue
+        assert state_dict is not None, f"Unable to load tensors from {file}"
         merged_result.update(state_dict)
     final_result = {}
     for key, value in merged_result.items():
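
The checkpoint-conversion change is a try-in-order loader fallback: guess the most likely format from the file name, then fall through the remaining loaders until one succeeds. A rough, self-contained sketch of that idea, assuming a generic load_checkpoint helper (not the repo's API):

from pathlib import Path
import torch

def load_checkpoint(file: Path) -> dict:
    # Candidate loaders, most likely first based on the file name.
    def load_pt_mmap():
        return torch.load(str(file), map_location="cpu", mmap=True, weights_only=True)

    def load_pt_no_mmap():
        return torch.load(str(file), map_location="cpu", mmap=False, weights_only=True)

    def load_safetensors():
        import safetensors.torch
        with open(file, "rb") as handle:
            return safetensors.torch.load(handle.read())

    if "safetensors" in file.name:
        loaders = [load_safetensors, load_pt_mmap, load_pt_no_mmap]
    else:
        loaders = [load_pt_mmap, load_pt_no_mmap, load_safetensors]

    # Try each loader in turn; the first one that parses the file wins.
    for loader in loaders:
        try:
            return loader()
        except Exception:
            continue
    raise RuntimeError(f"Unable to load tensors from {file}")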

torchchat/cli/download.py

Lines changed: 28 additions & 2 deletions
@@ -22,18 +22,44 @@
 def _download_hf_snapshot(
     model_config: ModelConfig, artifact_dir: Path, hf_token: Optional[str]
 ):
-    from huggingface_hub import snapshot_download
+    from huggingface_hub import model_info, snapshot_download
     from requests.exceptions import HTTPError
 
     # Download and store the HF model artifacts.
     print(f"Downloading {model_config.name} from HuggingFace...", file=sys.stderr)
     try:
+        # Fetch the info about the model's repo
+        model_info = model_info(model_config.distribution_path, token=hf_token)
+        model_fnames = [f.rfilename for f in model_info.siblings]
+
+        # Check the model config for preference between safetensors and pth
+        has_pth = any(f.endswith(".pth") for f in model_fnames)
+        has_safetensors = any(f.endswith(".safetensors") for f in model_fnames)
+
+        # If told to prefer safetensors, ignore pth files
+        if model_config.prefer_safetensors:
+            if not has_safetensors:
+                print(
+                    f"Model {model_config.name} does not have safetensors files, but prefer_safetensors is set to True. Using pth files instead.",
+                    file=sys.stderr,
+                )
+                exit(1)
+            ignore_patterns = "*.pth"
+
+        # If the model has both, prefer pth files over safetensors
+        elif has_pth and has_safetensors:
+            ignore_patterns = "*safetensors*"
+
+        # Otherwise, download everything
+        else:
+            ignore_patterns = None
+
         snapshot_download(
             model_config.distribution_path,
             local_dir=artifact_dir,
             local_dir_use_symlinks=False,
             token=hf_token,
-            ignore_patterns="*safetensors*",
+            ignore_patterns=ignore_patterns,
         )
     except HTTPError as e:
         if e.response.status_code == 401:  # Missing HuggingFace CLI login.
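
The download change reduces to picking an ignore_patterns value for snapshot_download from two facts (does the repo contain .pth files, does it contain .safetensors files) plus the new prefer_safetensors flag; everything else is passed through unchanged. A condensed sketch of just that decision, where choose_ignore_patterns is an illustrative helper rather than part of torchchat:

from typing import List, Optional

def choose_ignore_patterns(
    model_fnames: List[str], prefer_safetensors: bool
) -> Optional[str]:
    has_pth = any(f.endswith(".pth") for f in model_fnames)
    has_safetensors = any(f.endswith(".safetensors") for f in model_fnames)

    if prefer_safetensors:
        if not has_safetensors:
            # The diff prints a warning to stderr and calls exit(1) here.
            raise SystemExit(1)
        return "*.pth"          # skip the PyTorch checkpoints
    if has_pth and has_safetensors:
        return "*safetensors*"  # default: prefer pth when both formats exist
    return None                 # only one format available: download everything

With prefer_safetensors unset, the default branch matches the previous hard-coded ignore_patterns="*safetensors*" behavior whenever both formats are present, so existing model downloads are unaffected.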

torchchat/model_config/model_config.py

Lines changed: 1 addition & 0 deletions
@@ -46,6 +46,7 @@ class ModelConfig:
     checkpoint_file: str = field(default="model.pth")
     tokenizer_file: str = field(default="tokenizer.model")
     transformer_params_key: str = field(default=None)
+    prefer_safetensors: bool = field(default=False)
 
 
 # Keys are stored in lowercase.
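
The new prefer_safetensors field defaults to False, so only model entries that set it explicitly opt into safetensors-first downloads; _download_hf_snapshot reads it as model_config.prefer_safetensors. A pared-down illustration of the dataclass default, including only the fields visible in this hunk:

from dataclasses import dataclass, field

@dataclass
class ModelConfig:
    checkpoint_file: str = field(default="model.pth")
    tokenizer_file: str = field(default="tokenizer.model")
    transformer_params_key: str = field(default=None)
    prefer_safetensors: bool = field(default=False)

cfg = ModelConfig()
print(cfg.prefer_safetensors)  # False unless a model entry overrides it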

0 commit comments