@@ -7,7 +7,7 @@
 import os
 from pathlib import Path
 from types import SimpleNamespace
-from typing import Any, Dict
+from typing import Any, Dict, Tuple
 
 # Run command:
 # torchrun --nproc-per-node 4 dist_run.py
@@ -124,19 +124,19 @@ def _load_model_weights(stage_module, hf_model_name, device, model_config): |
 
 def _encode_string(string, tokenizer, bos=True, device="cuda", dtype=torch.int64)-> torch.Tensor:
     """Encode a prompt string into a tensor of token ids."""
-    tokens = tokenizer.encode(string)
-    if bos:
-        tokens = [tokenizer.bos_id()] + tokens
-    return torch.tensor(tokens, dtype=dtype, device=device)
-
-def _create_padded_prompt(input_ids, seqlen, start_pos, device) -> torch.Tensor:
-    """Create a padded tensor for the encoded input prompt."""
-    prompt_len = input_ids.size(0)
-    max_new_tokens = min(seqlen, seqlen - start_pos - prompt_len)
-    token_buffer_size = prompt_len + max_new_tokens
-    seq = torch.full((1, token_buffer_size), tokenizer.eos_id(), dtype=torch.int64, device=device)
-    seq[0, :prompt_len] = input_ids
-    return seq
+    tokens = tokenizer.encode(string)
+    if bos:
+        tokens = [tokenizer.bos_id()] + tokens
+    return torch.tensor(tokens, dtype=dtype, device=device)
+
+def _create_padded_prompt(input_ids, tokenizer, seqlen, start_pos, device) -> Tuple[torch.Tensor, int]:
+    """Create a padded tensor for the encoded input prompt. Returns the padded tensor and the prompt length."""
+    prompt_len = input_ids.size(0)
+    max_new_tokens = min(seqlen, seqlen - start_pos - prompt_len)
+    token_buffer_size = prompt_len + max_new_tokens
+    seq = torch.full((1, token_buffer_size), tokenizer.eos_id(), dtype=torch.int64, device=device)
+    seq[0, :prompt_len] = input_ids
+    return seq, prompt_len
 
 def _cleanup():
     dist.barrier()
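As a sanity check on the new helper contract, here is a minimal, self-contained sketch of how `_encode_string` and `_create_padded_prompt` compose. The `ToyTokenizer` stub and the `seqlen=16` value are illustrative assumptions, not part of this change; the real script uses the model's tokenizer:

```python
# Standalone sketch (no torchrun needed): encode a prompt, then pad it out
# to the full sequence length with EOS tokens, mirroring the helpers above.
import torch

class ToyTokenizer:
    """Hypothetical stand-in exposing only the methods the helpers call."""
    def encode(self, s):
        return list(s.encode("utf-8"))  # toy byte-level ids
    def bos_id(self):
        return 1
    def eos_id(self):
        return 2

tokenizer = ToyTokenizer()
seqlen, start_pos, device = 16, 0, "cpu"

# _encode_string: prepend BOS, return a 1-D tensor of token ids
tokens = [tokenizer.bos_id()] + tokenizer.encode("Hi")
input_ids = torch.tensor(tokens, dtype=torch.int64, device=device)

# _create_padded_prompt: with start_pos=0 the buffer is exactly seqlen long,
# prompt ids up front, EOS padding after
prompt_len = input_ids.size(0)
max_new_tokens = min(seqlen, seqlen - start_pos - prompt_len)
seq = torch.full((1, prompt_len + max_new_tokens), tokenizer.eos_id(),
                 dtype=torch.int64, device=device)
seq[0, :prompt_len] = input_ids
assert seq.shape == (1, seqlen) and prompt_len == 3
```

Returning `prompt_len` alongside the padded buffer is what lets the last pipeline stage index the prefill logits later, since the caller otherwise cannot tell where the prompt ends and the EOS padding begins.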
@@ -251,32 +251,30 @@ def main(): |
     if len(cpu_tensors) > 0:
         raise ValueError("Found cpu tensors in stage")
 
-
-    prompt = "What is the capital of France?"
+    prompt = "What is snow?"
     start_pos = 0
 
     # encode the prompt
     input_ids = _encode_string(prompt, tokenizer, bos=True, device=device, dtype=torch.int64)
-
-    # create a padded tensor for the input prompt
-    seq = _create_padded_prompt(input_ids, seqlen, start_pos, device)
+    logger.info(f"{input_ids[0:8]=}")
 
+    # create a padded tensor for the input prompt
+    padded_sequence, prompt_len = _create_padded_prompt(input_ids, tokenizer, seqlen, start_pos, device)
+    logger.info(f"{prompt_len=}")
 
     schedule = ScheduleGPipe(stage, mbs)
     logger.info(f"Created schedule: {schedule}")
 
     with torch.no_grad():  # .inference_mode():
         if pp_rank == 0:
-            schedule.step(seq)
+            schedule.step(padded_sequence)
         else:
             output = schedule.step()
 
     # Decoding
     if pp_rank == pp_degree - 1 and tp_rank == 0:
-
         next_token_logits = output[:,prompt_len-1, :]
         next_token = torch.argmax(next_token_logits, dim=-1)
-
         next_token_decoded = tokenizer.decode((next_token.tolist()))
 
         logger.info(f"\n\n{color.green} Prefill response ====>>>> {color.blue} {next_token_decoded=}, {next_token}\n{color.reset}")
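The decode step relies on the `prompt_len` returned above: in the stage output of shape `[batch, token_buffer, vocab]`, the logits at index `prompt_len - 1` predict the first token after the prompt, since every later position only attended to EOS padding. A minimal sketch with made-up shapes (the random `output` tensor and `vocab_size` are stand-ins for the real stage output, not values from this change):

```python
# Greedy pick of the first generated token from prefill logits (toy shapes).
import torch

batch, token_buffer, vocab_size = 1, 16, 128   # illustrative sizes only
prompt_len = 3
output = torch.randn(batch, token_buffer, vocab_size)  # stand-in for schedule.step() output

next_token_logits = output[:, prompt_len - 1, :]      # logits for the position after the prompt
next_token = torch.argmax(next_token_logits, dim=-1)  # greedy decode, shape [batch]
print(next_token.tolist())
```

Note the rank asymmetry in the GPipe schedule above: only pipeline rank 0 feeds the padded sequence into `schedule.step(...)`, while later stages call `schedule.step()` with no argument and receive their inputs from the previous stage, so only the last stage holds logits worth decoding.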