
Commit ac9aa36

generate
1 parent 384ae56 commit ac9aa36

4 files changed: +17 lines, -4 lines

data/eassy.txt

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

examples/generation.py

Lines changed: 2 additions & 1 deletion
@@ -11,6 +11,7 @@
 parser.add_argument('--M', type=int, default=8192, help='max length')
 parser.add_argument('--D', type=int, default=1, help='dec length')
 parser.add_argument('--G', type=int, default=256, help='generation length')
+parser.add_argument('--t', type=float, default=0.6, help='temperature')
 parser.add_argument('--K', type=int, default=10, help='K')
 parser.add_argument('--L', type=int, default=150, help='K')
 parser.add_argument('--data', type=str, default="../data/story.txt", help='source data file')
@@ -35,7 +36,7 @@
 input_ids = input_ids.to(DEVICE)
 PREFIX_LEN = input_ids.shape[1]
 position_ids = torch.arange(MAX_LEN, device=DEVICE).unsqueeze(0)
-generated = llm.generate(input_ids, max_tokens=args.G)
+generated = llm.generate(input_ids, max_tokens=args.G, verbose=True, temperature=args.t)
 text = tokenizer.decode(generated, skip_special_tokens=True)
 print("\033[32m" + text + "\033[0m")
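
The updated call, annotated (a readability sketch only; the argument meanings are inferred from the diff and the argparse help strings above, and the comments are not part of the commit):

generated = llm.generate(
    input_ids,            # tokenized prompt already moved to DEVICE
    max_tokens=args.G,    # --G: number of tokens to generate (default 256)
    temperature=args.t,   # --t: sampling temperature (default 0.6)
    verbose=True,         # print prefill size, generated token count, and ms/token latency
)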

models/llama.py

Lines changed: 12 additions & 2 deletions
@@ -6,6 +6,7 @@
 from .utils import apply_rotary_pos_emb, layer_norm, topp_temperature_decode
 import flashinfer
 from .attnserver import LSHSparseAttnServer, AttnServer
+import time
 class LLMLayer:
     def __init__(self, layer_idx) -> None:
 
@@ -328,13 +329,16 @@ def generate(self,
                  input_ids: torch.LongTensor,
                  max_tokens: int = 128,
                  temperature: float = 0.6,
-                 topp: float = 0.9):
+                 topp: float = 0.9,
+                 verbose: bool = False):
 
         generated = []
         prefix_len = input_ids.shape[1]
         position_ids = torch.arange(prefix_len + max_tokens, device=self.device).unsqueeze(0)
         logits = self.prefill(input_ids=input_ids)
-
+        torch.cuda.synchronize()
+        if verbose:
+            t1 = time.time()
         for k in range(max_tokens):
             if temperature < 0.1:
                 input_ids = logits.argmax(dim=-1)
@@ -344,6 +348,12 @@ def generate(self,
             generated.append(input_ids[0].item())
             if input_ids[0].item() in self.eos_tokens:
                 break
+        if verbose:
+            torch.cuda.synchronize()
+            t2 = time.time()
+            print("\033[94m[INFO] Prefill {} tokens\033[0m".format(prefix_len))
+            print("\033[94m[INFO] Generate {} tokens\033[0m".format(len(generated)))
+            print("\033[94m[INFO] Decoding Latency {:.2f} ms/token\033[0m".format(1000 * (t2 - t1)/len(generated)))
         self.attention_server.clear()
         self.k_cache.zero_()
         self.v_cache.zero_()
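
The commit times the decode loop with a synchronize/timestamp pattern. A minimal, self-contained sketch of that pattern (using a stand-in nn.Linear rather than the repo's LLM class; only the measurement structure matches the commit):

import time
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.nn.Linear(4096, 4096).to(device)
x = torch.randn(1, 4096, device=device)

steps = 32
if device == "cuda":
    torch.cuda.synchronize()   # drain queued kernels so t1 marks the true start
t1 = time.time()
for _ in range(steps):         # stands in for the per-token decode loop
    x = model(x)
if device == "cuda":
    torch.cuda.synchronize()   # GPU work is asynchronous; wait before reading the clock
t2 = time.time()
print("{:.2f} ms/step".format(1000 * (t2 - t1) / steps))

Without the second synchronize the loop mostly measures kernel launch time, which is why generate() calls torch.cuda.synchronize() before both timestamps.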

models/template.py

Lines changed: 2 additions & 1 deletion
@@ -14,5 +14,6 @@
 
 Templates = {
     'meta-llama2': "[INST] {} [/INST]",
-    'meta-llama3': "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n\n",
+    'meta-llama3': "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|>\n<|start_header_id|>assistant<|end_header_id|>\n",
+    'None': "{}",
 }
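
Hypothetical usage of the Templates dict (the import path and prompt text are assumptions; the format strings themselves come from the diff):

from models.template import Templates

prompt = "Summarize the story in one paragraph."
chat_input = Templates['meta-llama3'].format(prompt)  # wrap the prompt in Llama-3 chat headers
raw_input = Templates['None'].format(prompt)          # new 'None' entry: pass the prompt through unchanged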
