
Commit f3cbd53 (merge commit)

Commit message: merge with main
Parents: 128566c + f730056

File tree: 4 files changed (+29 −15 lines)


dist_run.py

Lines changed: 8 additions & 5 deletions
@@ -55,7 +55,6 @@
     "llama2-7b-chat": ("meta-llama/Llama-2-7b-chat-hf", torch.float16),
     "llama3": ("meta-llama/Meta-Llama-3-8B-Instruct", torch.bfloat16),
 }
-CACHE_PRECISION = torch.bfloat16


 def _init_distributed():
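
The removed CACHE_PRECISION global had pinned the cache precision to bfloat16 regardless of the model. With this change the dtype comes from the per-model entry in the table shown above. A minimal, hypothetical sketch of that lookup follows; the actual table and variable names in dist_run.py may differ, but the (HF repo id, dtype) pairing is taken from the diff:

```python
import torch

# Hypothetical reconstruction of how model_dtype is obtained; the real table
# name in dist_run.py may differ, but the entries mirror the diff above.
MODEL_TABLE = {
    "llama2-7b-chat": ("meta-llama/Llama-2-7b-chat-hf", torch.float16),
    "llama3": ("meta-llama/Meta-Llama-3-8B-Instruct", torch.bfloat16),
}

distribution, model_dtype = MODEL_TABLE["llama3"]
print(distribution, model_dtype)  # meta-llama/Meta-Llama-3-8B-Instruct torch.bfloat16
```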
@@ -245,8 +244,8 @@ def main(args):

     tokenizer = _build_chat_tokenizer(model_name)

-    set_precision(CACHE_PRECISION)
-    logger.info(f"Using cache precision {CACHE_PRECISION}")
+    set_precision(model_dtype)
+    logger.info(f"Using cache precision {model_dtype}")

     hf_config = get_hf_config_file(distribution)
     if hf_config is None:
@@ -285,8 +284,6 @@ def main(args):
     with device:
         model = Transformer(config)

-    model.setup_caches(1, 4096)
-
     # Distribute model on TP mesh
     model.distribute(tp_mesh)
     if rank == 0:
@@ -300,6 +297,12 @@ def main(args):
     dim = 4096  # embedding dimension
     assert seqlen % sp_degree == 0

+    # Setup KV caches (after model distribution)
+    # TODO: the setting below only works for 1 micro-batch case. To support
+    # multiple micro-batches, we need the KV cache in the model to be aware of
+    # the number of micro-batches and the current micro-batch index.
+    model.setup_caches(mb_size, seqlen)
+
     mb_ids = torch.randint(0, config.vocab_size, (mb_size, seqlen), device=device)
     activation = torch.rand(
         mb_size, seqlen // sp_degree, dim, device=device, dtype=model_dtype
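
Taken together, the dist_run.py changes amount to a reordering: precision is set from the model's own dtype, and KV caches are allocated only after tensor parallelism has been applied, sized by the micro-batch and sequence length. A condensed sketch of the resulting flow in main(); this is not runnable on its own, since it assumes the objects built earlier in the script (set_precision, Transformer, logger, config, device, tp_mesh, mb_size, seqlen):

```python
# Condensed from the diff above; assumes dist_run.py's existing imports/setup.
set_precision(model_dtype)                      # dtype from the model table, not a global
logger.info(f"Using cache precision {model_dtype}")

with device:
    model = Transformer(config)                 # build the model first

model.distribute(tp_mesh)                       # apply tensor parallelism on the TP mesh

# Allocate KV caches only after distribution, so each rank sizes its cache
# for its post-TP share of heads (see torchchat/model.py below).
model.setup_caches(mb_size, seqlen)
```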

install/install_requirements.sh

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ PYTORCH_NIGHTLY_VERSION=dev20240814
 VISION_NIGHTLY_VERSION=dev20240814

 # Nightly version for torchtune
-TUNE_NIGHTLY_VERSION=dev20240910
+TUNE_NIGHTLY_VERSION=dev20240916


 # Uninstall triton, as nightly will depend on pytorch-triton, which is one and the same

torchchat/generate.py

Lines changed: 4 additions & 4 deletions
@@ -727,22 +727,22 @@ def chat(
         if generator_args.image_prompts is not None:
             print("Image prompts", generator_args.image_prompts)

+            # Support for just the first image prompt for now
+            images = [Image.open(generator_args.image_prompts[0])]
             messages = [
                 Message(
                     role="user",
                     content=[
-                        {"type": "image"},
+                        {"type": "image", "content": images[0]},
                         {"type": "text", "content": generator_args.prompt},
                     ],
                     eot=True,
                 ),
                 Message(role="assistant", content=""),
             ]

-            images = [Image.open(generator_args.image_prompts[0])]
             transform = flamingo_transform(str(self.tokenizer_args.tokenizer_path))
-
-            data = transform({"images": images, "messages": messages}, inference=True)
+            data = transform({"messages": messages}, inference=True)
             batch = padded_collate([data], self.builder_args.device)
             batch.pop("mask")
             encoded = batch["tokens"]
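
The PIL image is now attached directly to the user Message's content, so the Flamingo transform no longer receives a separate "images" entry. A hedged sketch of the resulting flow, assuming the torchtune APIs that generate.py already imports (Message, flamingo_transform) and placeholder paths and prompt text; in generate.py these come from generator_args and tokenizer_args:

```python
from PIL import Image

# Message and flamingo_transform come from torchtune, imported as in generate.py.
# The paths and prompt below are placeholders for illustration only.
image_path = "path/to/image_prompt.jpg"
tokenizer_path = "path/to/tokenizer.model"
prompt = "What is in this image?"

# Only the first image prompt is supported for now, mirroring the diff above.
image = Image.open(image_path)

messages = [
    Message(
        role="user",
        content=[
            {"type": "image", "content": image},  # image embedded in the message itself
            {"type": "text", "content": prompt},
        ],
        eot=True,
    ),
    Message(role="assistant", content=""),
]

transform = flamingo_transform(tokenizer_path)
# No separate "images" key anymore: the transform pulls images from the messages.
data = transform({"messages": messages}, inference=True)
```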

torchchat/model.py

Lines changed: 16 additions & 5 deletions
@@ -623,13 +623,15 @@ def setup_caches(self, max_batch_size, max_seq_length):
             and self.max_batch_size >= max_batch_size
         ):
             return
-        head_dim = self.config.dim // self.config.n_heads
         max_seq_length = find_multiple(max_seq_length, 8)
         self.max_seq_length = max_seq_length
         self.max_batch_size = max_batch_size
         for b in self.layers.values():
-            b.attention.kv_cache = KVCache(
-                max_batch_size, max_seq_length, self.config.n_local_heads, head_dim
+            # Lower the setup_cache call to the attention module because tensor
+            # parallelism may have been applied there and the `n_local_heads``
+            # value being adjusted.
+            b.attention.setup_cache(
+                max_batch_size, max_seq_length,
             )

         freqs_cis = precompute_freqs_cis(
@@ -745,6 +747,16 @@ def __init__(self, config: TransformerArgs):
         self.dim = config.dim
         self._register_load_state_dict_pre_hook(self.load_hook)

+    def setup_cache(self, max_batch_size, max_seq_length):
+        n_local_heads = self.n_local_heads
+        # If TP is enabled, the heads would be divided and assigned to different ranks
+        if hasattr(self, "tp_degree"):
+            n_local_heads = self.n_local_heads // self.tp_degree
+
+        self.kv_cache = KVCache(
+            max_batch_size, max_seq_length, n_local_heads, self.head_dim
+        )
+
     def load_hook(self, state_dict, prefix, *args):
         # if prefix + "wq.weight" in state_dict:
         #     wq = state_dict.pop(prefix + "wq.weight")
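
The cache construction moves out of Transformer.setup_caches and into the new Attention.setup_cache so the head count can reflect tensor parallelism: once distribute() has recorded a tp_degree, each rank only owns n_local_heads // tp_degree KV heads. A small, self-contained illustration of that sizing logic with made-up numbers; in torchchat the real values come from the model config and the TP device mesh, and the KVCache buffers are shaped roughly (max_batch_size, n_heads, max_seq_length, head_dim):

```python
# Illustrative numbers only; not taken from any specific model config.
n_local_heads = 8      # KV heads in the full (undistributed) model
head_dim = 128         # dim // n_heads
tp_degree = 4          # device_mesh.size(), recorded by Attention.distribute()

max_batch_size, max_seq_length = 1, 4096

# Without TP (no tp_degree attribute) the cache covers all KV heads.
full_shape = (max_batch_size, n_local_heads, max_seq_length, head_dim)

# With TP, wk/wv are column-sharded, so each rank holds only its share of
# heads and the per-rank cache shrinks accordingly.
per_rank_heads = n_local_heads // tp_degree
tp_shape = (max_batch_size, per_rank_heads, max_seq_length, head_dim)

print(full_shape)  # (1, 8, 4096, 128)
print(tp_shape)    # (1, 2, 4096, 128)
```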
@@ -785,14 +797,13 @@ def _unfuse_wqkv_state_dict(

     def distribute(self, device_mesh: DeviceMesh):
         self.device_mesh = device_mesh
+        self.tp_degree = device_mesh.size()
         parallelize_module(self.wq, device_mesh, ColwiseParallel())
         parallelize_module(self.wk, device_mesh, ColwiseParallel())
         parallelize_module(self.wv, device_mesh, ColwiseParallel())
         parallelize_module(
             self.wo, device_mesh, RowwiseParallel(output_layouts=Shard(1))
         )
-        # TODO: enable kv cache in distributed case
-        self.kv_cache = None

     def forward(
         self,
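
distribute() now records the TP degree instead of nulling out the cache, which implies an ordering contract: distribution must happen before cache setup for the hasattr(self, "tp_degree") branch to fire. A two-line sketch of that contract, where attn is a hypothetical handle to an Attention module and tp_mesh, mb_size, seqlen stand in for the values used in dist_run.py:

```python
# Sketch of the ordering the hasattr(self, "tp_degree") check relies on.
attn.distribute(tp_mesh)            # shards wq/wk/wv/wo and sets attn.tp_degree
attn.setup_cache(mb_size, seqlen)   # KVCache sized for n_local_heads // tp_degree

# If the calls were reversed, setup_cache would not see tp_degree and would
# allocate a cache for the full head count on every rank.
```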
