@@ -437,13 +437,15 @@ def setup_caches(self, max_batch_size, max_seq_length):
             and self.max_batch_size >= max_batch_size
         ):
             return
-        head_dim = self.config.dim // self.config.n_heads
         max_seq_length = find_multiple(max_seq_length, 8)
         self.max_seq_length = max_seq_length
         self.max_batch_size = max_batch_size
         for b in self.layers.values():
-            b.attention.kv_cache = KVCache(
-                max_batch_size, max_seq_length, self.config.n_local_heads, head_dim
+            # Lower the setup_cache call to the attention module because tensor
+            # parallelism may have been applied there and the `n_local_heads`
+            # value may have been adjusted.
+            b.attention.setup_cache(
+                max_batch_size, max_seq_length,
             )
 
         freqs_cis = precompute_freqs_cis(
@@ -559,6 +561,16 @@ def __init__(self, config: TransformerArgs):
         self.dim = config.dim
         self._register_load_state_dict_pre_hook(self.load_hook)
 
+    def setup_cache(self, max_batch_size, max_seq_length):
+        n_local_heads = self.n_local_heads
+        # If TP is enabled, the heads would be divided and assigned to different ranks
+        if hasattr(self, "tp_degree"):
+            n_local_heads = self.n_local_heads // self.tp_degree
+
+        self.kv_cache = KVCache(
+            max_batch_size, max_seq_length, n_local_heads, self.head_dim
+        )
+
     def load_hook(self, state_dict, prefix, *args):
         # if prefix + "wq.weight" in state_dict:
         #     wq = state_dict.pop(prefix + "wq.weight")
@@ -599,14 +611,13 @@ def _unfuse_wqkv_state_dict(
 
     def distribute(self, device_mesh: DeviceMesh):
         self.device_mesh = device_mesh
+        self.tp_degree = device_mesh.size()
         parallelize_module(self.wq, device_mesh, ColwiseParallel())
         parallelize_module(self.wk, device_mesh, ColwiseParallel())
         parallelize_module(self.wv, device_mesh, ColwiseParallel())
         parallelize_module(
            self.wo, device_mesh, RowwiseParallel(output_layouts=Shard(1))
         )
-        # TODO: enable kv cache in distributed case
-        self.kv_cache = None
 
     def forward(
         self,
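For readers following the change: the ordering it relies on is that `distribute()` records the TP degree before `setup_caches()` runs, so each rank allocates a KV cache sized for its local slice of the heads rather than the full head count. Below is a minimal, self-contained sketch of that logic; `MiniAttention` and `MiniKVCache` are hypothetical stand-ins for the repo's `Attention` and `KVCache`, and only the head-division arithmetic mirrors the diff.

```python
import torch


# Hypothetical stand-in for the repo's KVCache: just pre-allocates k/v buffers.
class MiniKVCache:
    def __init__(self, max_batch_size, max_seq_length, n_heads, head_dim):
        shape = (max_batch_size, n_heads, max_seq_length, head_dim)
        self.k_cache = torch.zeros(shape)
        self.v_cache = torch.zeros(shape)


# Hypothetical stand-in for the attention module touched by the diff.
class MiniAttention:
    def __init__(self, n_local_heads, head_dim):
        self.n_local_heads = n_local_heads
        self.head_dim = head_dim
        self.kv_cache = None

    def distribute(self, tp_degree):
        # Mirrors the diff: record the TP degree instead of disabling the cache.
        self.tp_degree = tp_degree

    def setup_cache(self, max_batch_size, max_seq_length):
        n_local_heads = self.n_local_heads
        # If TP was applied, each rank only holds a slice of the heads.
        if hasattr(self, "tp_degree"):
            n_local_heads = self.n_local_heads // self.tp_degree
        self.kv_cache = MiniKVCache(
            max_batch_size, max_seq_length, n_local_heads, self.head_dim
        )


attn = MiniAttention(n_local_heads=32, head_dim=128)
attn.distribute(tp_degree=4)  # TP shards the projections first ...
attn.setup_cache(max_batch_size=1, max_seq_length=2048)  # ... then the cache is sized per rank
print(attn.kv_cache.k_cache.shape)  # torch.Size([1, 8, 2048, 128]): 32 // 4 heads per rank
```

If the cache were still created in `Transformer.setup_caches()` from `self.config.n_local_heads`, every rank would allocate buffers for the full, unsharded head count; lowering the call into the attention module is what lets the per-rank head count be used instead.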