@@ -100,7 +100,6 @@ def decode(
     eos_token_id=None,
     teacher_outputs=None,
     vocab_size=None,
-    tensor_parallel=1,
     cg=False,
     enable_timing=False,
     streamer: Optional[TextStreamer] = None
@@ -134,7 +133,6 @@ def decode(
             batch_size,
             seqlen_og,
             max_length,
-            tensor_parallel=tensor_parallel,
         )
         inference_params = model._decoding_cache.inference_params
         inference_params.reset(max_length, batch_size)
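
For orientation, here is a minimal caller-side sketch of the CUDA-graph path after this change: with cg=True, decode builds or reuses model._decoding_cache through update_graph_cache and no longer accepts a tensor_parallel argument. The leading positional arguments (input_ids, model, max_length) and the concrete model object are assumptions not shown in this hunk.

import torch

# Hypothetical usage sketch, not part of this commit. `model` stands in for any
# instantiated model that implements allocate_inference_cache(batch_size, max_seqlen, dtype).
input_ids = torch.randint(0, 50257, (1, 16), dtype=torch.long, device="cuda")  # arbitrary vocab size
output = decode(
    input_ids,
    model,
    max_length=64,
    cg=True,             # populates model._decoding_cache via update_graph_cache
    enable_timing=True,  # prints prompt processing + decoding time
)
print(output.sequences.shape)
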
@@ -186,8 +184,6 @@ def should_stop(current_token, inference_params):
     end = torch.cuda.Event(enable_timing=enable_timing)

     if enable_timing:
-        if tensor_parallel > 1:
-            torch.distributed.barrier()
         start.record()
     scores, sequences = [], [input_ids]
     while not should_stop(sequences[-1], inference_params):
@@ -201,8 +197,6 @@ def should_stop(current_token, inference_params):
         streamer.end()
     if enable_timing:
         end.record()
-        if tensor_parallel > 1:
-            torch.distributed.barrier()
         torch.cuda.synchronize()
         print(f"Prompt processing + decoding time: {(start.elapsed_time(end)):.0f}ms")
     output_cls = GreedySearchDecoderOnlyOutput if top_k == 1 else SampleDecoderOnlyOutput
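
With the barriers removed from the timing block, a multi-rank caller (for example, one still sharding the model across GPUs) can reproduce the old synchronization outside decode. A minimal sketch, assuming torch.distributed has already been initialized by the caller; timed_decode is an illustrative wrapper name, not part of this file.

import torch
import torch.distributed as dist

def timed_decode(*args, **kwargs):
    # Synchronize ranks before starting the clock, as the removed code did.
    multi_rank = dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1
    if multi_rank:
        dist.barrier()
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    out = decode(*args, **kwargs)
    end.record()
    if multi_rank:
        dist.barrier()
    torch.cuda.synchronize()
    print(f"Prompt processing + decoding time: {start.elapsed_time(end):.0f}ms")
    return out
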
@@ -232,22 +226,6 @@ def generate(
         return output if return_dict_in_generate else output.sequences


-def allocate_inference_cache(
-    max_batch_size,
-    max_seqlen,
-    nheads,
-    headdim,
-    layers: Union[int, Sequence],
-    device,
-    dtype=torch.float16,
-):
-    assert dtype in [torch.float16, torch.bfloat16, torch.float32]
-    kv_cache_shape = (max_batch_size, max_seqlen, 2, nheads, headdim)
-    if isinstance(layers, int):
-        layers = range(layers)
-    return {i: torch.empty(kv_cache_shape, device=device, dtype=dtype) for i in layers}
-
-
 @dataclass
 class DecodingCGCache:
     max_batch_size: int = 0
@@ -268,7 +246,6 @@ def update_graph_cache(
     seqlen_og,
     max_seqlen,
     decoding_seqlens=(1,),
-    tensor_parallel=1,
     dtype=None,
     n_warmups=2,
 ):
@@ -289,23 +266,8 @@ def update_graph_cache(
         gc.collect()
         cache.device, cache.dtype = device, dtype
         cache.max_batch_size, cache.max_seqlen = batch_size, max_seqlen
-        if hasattr(model, "allocate_inference_cache"):
-            inf_cache = model.allocate_inference_cache(batch_size, max_seqlen, dtype)
-        else:
-            headdim = getattr(
-                model.config,
-                "head_dim",
-                model.config.hidden_size // model.config.num_attention_heads,
-            )
-            inf_cache = allocate_inference_cache(
-                batch_size,
-                max_seqlen,
-                model.config.num_attention_heads // tensor_parallel,
-                headdim,
-                model.config.num_hidden_layers,
-                device,
-                dtype,
-            )
+        assert hasattr(model, "allocate_inference_cache"), "CUDA graph decoding requires that the model has a method allocate_inference_cache"
+        inf_cache = model.allocate_inference_cache(batch_size, max_seqlen, dtype)
         lengths_per_sample = torch.full((batch_size,), seqlen_og, dtype=torch.int32, device=device)
         cache.inference_params = InferenceParams(
             max_seqlen=max_seqlen,
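
Since the module-level fallback allocator is gone, any model used with cg=True must now provide allocate_inference_cache itself. Below is a minimal sketch of such a method for an attention-style model, mirroring the per-layer (batch, seqlen, 2, nheads, headdim) KV-cache layout of the removed helper; the MyAttentionModel class and its config attribute names are assumptions for illustration.

import torch

class MyAttentionModel(torch.nn.Module):  # hypothetical model, not part of this repo
    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None):
        # Mirror the removed module-level helper: one empty KV-cache tensor per layer,
        # shaped (batch, seqlen, 2, nheads, headdim), keyed by layer index.
        dtype = dtype if dtype is not None else torch.float16
        assert dtype in [torch.float16, torch.bfloat16, torch.float32]
        headdim = getattr(
            self.config, "head_dim", self.config.hidden_size // self.config.num_attention_heads
        )
        kv_cache_shape = (batch_size, max_seqlen, 2, self.config.num_attention_heads, headdim)
        device = next(self.parameters()).device
        return {
            i: torch.empty(kv_cache_shape, device=device, dtype=dtype)
            for i in range(self.config.num_hidden_layers)
        }
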