@@ -209,6 +209,7 @@ def _batch_decode_next_tokens(
     batch_size, seq_len, vocab_size = output.shape
 
     if step != -1:
+        # `pos` is not provided, so we can use the first token
         next_token_logits = output[:, 0, :]
     else:
         # get the logits for each prompt at the specified positions
@@ -228,9 +229,9 @@ def _batch_decode_next_tokens(
         ).squeeze(-1)
     else:
         # Argmax (deterministic)
-        next_tokens = torch.argmax(next_token_logits, dim=-1)
+        next_tokens = torch.argmax(next_token_logits, dim=-1, keepdim=True)
 
-    logger.info(f"{color.yellow}Next tokens: {color.blue}{next_tokens}{color.reset}")
+    # Token ids in int tensor form
     return next_tokens
 
 
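For context on the `keepdim=True` change: a minimal sketch (hypothetical shapes, not this repo's code) of how the extra keyword keeps one column of token ids per prompt, so downstream code can keep treating the result as `(batch_size, 1)`.

```python
# Minimal sketch, assuming a (batch_size, vocab_size) logits tensor with made-up sizes.
import torch

next_token_logits = torch.randn(4, 32000)                     # (batch_size, vocab_size)
flat = torch.argmax(next_token_logits, dim=-1)                # shape: (4,)
kept = torch.argmax(next_token_logits, dim=-1, keepdim=True)  # shape: (4, 1)
print(flat.shape, kept.shape)                                 # torch.Size([4]) torch.Size([4, 1])
```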
@@ -247,6 +248,11 @@ def _update_padded_sequence(
 # Decode token id into string and print it
 def _decode_in_flight(token, tokenizer, tp_rank):
     """decode token ids for all prompts in the batch and log them"""
+    # `token` is a tensor of shape (batch_size, 1).
+    # For TiktokenTokenizer, we need to squeeze it to 1D.
+    # For SentencePieceProcessor, we don't.
+    if isinstance(tokenizer, TiktokenTokenizer):
+        token = torch.squeeze(token, dim=1)
     token_str = tokenizer.decode(token.tolist())
     # print the token string on tp rank 0
     if tp_rank == 0:
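A small illustration of the shape handling above, with made-up token ids: it only shows what `.tolist()` yields before and after the squeeze, per the comment in the diff about the two tokenizer classes.

```python
# Sketch only: `token` holds one generated id per prompt, as described above.
import torch

token = torch.tensor([[101], [102], [103]])    # (batch_size, 1)
print(token.tolist())                          # [[101], [102], [103]] (nested per-prompt lists)
print(torch.squeeze(token, dim=1).tolist())    # [101, 102, 103] (flat list of ids)
```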
@@ -328,15 +334,26 @@ def main(args):
     config.stage_idx = pp_rank
     config.n_stages = pp_degree
 
-    with device:
+    with torch.device("meta"):
         # TODO: we should create model instead of Transformer
         model = Transformer(config)
 
     # Distribute model on TP mesh
+    # (Surprisingly, this works even though model is on meta device and mesh is of
+    # cuda devices)
     model.distribute(tp_mesh)
     if rank == 0:
         logger.info(f"Model: {model}")
 
+    # Load weights
+    logger.info(f"Loading weights for {pp_rank=} on {device=}")
+    with CUDATrackTime() as timer:
+        _load_model_weights(model, distribution, device=device, model_config=config)
+
+    logger.info(
+        f"{color.green}Total weight loading time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}"
+    )
+
     # Batch size. Since we push batches dynamically through the pipeline rather
     # than chunking them, this is effectively micro-batch size in pipeline
     # sense. Thus it is interchangeable with micro-batch size below.
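The `torch.device("meta")` context here is the standard PyTorch deferred-initialization pattern: parameters are created as shapes only, and real storage appears later when checkpoint weights are loaded onto the target device. A generic sketch of that pattern (plain `nn.Linear` and random tensors standing in for this repo's `Transformer` and `_load_model_weights`):

```python
# Generic meta-device sketch; load_state_dict(..., assign=True) needs PyTorch >= 2.1.
import torch
import torch.nn as nn

with torch.device("meta"):
    layer = nn.Linear(1024, 1024)          # no memory allocated for weights yet
print(layer.weight.device)                 # meta

# Materialize later by assigning real tensors (random here, a checkpoint in practice):
state = {"weight": torch.randn(1024, 1024), "bias": torch.zeros(1024)}
layer.load_state_dict(state, assign=True)
print(layer.weight.device)                 # cpu
```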
@@ -352,17 +369,8 @@ def main(args):
     # lanes.
     # TODO: bump up the lane count
     pipeline_lanes = 1
-    model.setup_caches(batch_size, seqlen_prefill, cache_lanes=pipeline_lanes)
-
-    # Load weights
-    logger.info(f"Loading weights for {pp_rank=} on {device=}")
-    with CUDATrackTime() as timer:
-        _load_model_weights(model, distribution, device=device, model_config=config)
-        model.to(device)
-
-    logger.info(
-        f"{color.green}Total weight loading time: {timer.get_time()} {timer.unit} for rank {rank}{color.reset}"
-    )
+    with device:
+        model.setup_caches(batch_size, seqlen_prefill, cache_lanes=pipeline_lanes)
 
     # info on stage size and params
     stage_size = get_module_size(model)
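Moving `setup_caches` under `with device:` relies on `torch.device` acting as a context manager that sets the default device, so the cache tensors are allocated directly on the target GPU once the weights are in place. A minimal sketch of that behavior, with a hypothetical cache shape:

```python
# Sketch of the `with device:` allocation behavior (PyTorch >= 2.0).
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
with device:
    kv_cache = torch.zeros(1, 8, 4096, 128)   # hypothetical (lanes, heads, seq, head_dim)
print(kv_cache.device)                        # cuda:0 on a GPU machine, else cpu
```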
@@ -528,14 +536,12 @@ def get_example_ins_outs(seqlen: int) -> Tuple[torch.Tensor, torch.Tensor]:
 
     # output formatted response via last pp group and tp rank 0
     if pp_rank == last_pp_rank and tp_rank == 0:
-        # `res` is a list of tensors, each being a batch of generated token ids
-
-        res_stacked = torch.stack(res, dim=1)
-        res_list = res_stacked.tolist()
-
-        # Decode the output as comprehension instead of loop
-        responses = [tokenizer.decode(sequence) for sequence in res_list]
-
+        # `res` is a list of tensors, each being a batch of generated token ids.
+        # We need to concatenate them to get the full sequence of generated
+        # token ids. Thus cat'ing along dim 1.
+        res = torch.cat(res, dim=1)
+        res_list = res.tolist()
+        responses = tokenizer.decode(res_list)
         # Show prompts and responses
         for prompt_text, response_text in zip(prompt, responses):
             logger.info(f"Prompt: {color.green}{prompt_text}{color.reset}")
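For the final decode, each generation step contributes a `(batch_size, 1)` column of token ids, so concatenating along dim 1 rebuilds one full id sequence per prompt; `tolist()` then yields the nested list that a batch-aware tokenizer `decode` can consume. A toy sketch with made-up ids:

```python
# Toy sketch: three decode steps for a batch of two prompts; ids are made up.
import torch

res = [torch.tensor([[5], [9]]), torch.tensor([[6], [10]]), torch.tensor([[7], [11]])]
res = torch.cat(res, dim=1)   # shape (2, 3): one row of generated ids per prompt
print(res.tolist())           # [[5, 6, 7], [9, 10, 11]]
```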