
Commit e2a5d20

Merge pull request #97 from Narsil/medusa2
Creating medusa2.
2 parents: 5e98053 + 2b8020f

3 files changed: +27 -23 lines


create_data.py

Lines changed: 7 additions & 6 deletions
@@ -11,12 +11,12 @@

 client = httpx.AsyncClient(timeout=None)

-async def run(conv: Conversation):
+async def run(conv: Conversation, url: str):
     payload = {"model":"tgi", "messages": conv.messages}
     response = await client.post(url, json=payload)
     content = response.json()
     message = content["choices"][0]["message"]
-    message.pop("name")
+    message.pop("name", None)
     conv.add_message(message)
@@ -34,15 +34,16 @@ def fix_source(source):
     return new_source


-async def recreate_conversation(conversation, sem):
+async def recreate_conversation(conversation, sem, url):
     async with sem:
         conv = Conversation()
         try:
             for message in conversation[::2]:
                 assert message["role"] == "user"
                 conv.add_message(message)
-                await run(conv)
-        except Exception:
+                await run(conv, url)
+        except Exception as e:
+            print(e)
             pass
         return conv.messages
@@ -62,7 +63,7 @@ async def _main():

     futures = []
     for conversation in conversations:
-        future = recreate_conversation(conversation, sem)
+        future = recreate_conversation(conversation, sem, url)
         futures.append(future)

     recreated_conversations = await tqdm.asyncio.tqdm.gather(*futures)
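
Read together, the three hunks thread the serving endpoint url explicitly through _main -> recreate_conversation -> run, make message.pop("name", None) tolerant of backends that omit the field, and print per-conversation failures instead of swallowing them silently. A minimal sketch of the resulting flow follows; Conversation is a stand-in for the script's class, the endpoint URL and semaphore size are assumptions (any OpenAI-compatible chat server works), and plain asyncio.gather replaces the script's tqdm progress bar.

# Sketch only: Conversation, the URL, and the semaphore size are assumptions.
import asyncio

import httpx

client = httpx.AsyncClient(timeout=None)


class Conversation:
    def __init__(self):
        self.messages = []

    def add_message(self, message):
        self.messages.append(message)


async def run(conv, url):
    payload = {"model": "tgi", "messages": conv.messages}
    response = await client.post(url, json=payload)
    message = response.json()["choices"][0]["message"]
    message.pop("name", None)  # tolerate responses without a "name" field
    conv.add_message(message)


async def recreate_conversation(conversation, sem, url):
    async with sem:
        conv = Conversation()
        try:
            for message in conversation[::2]:  # every other turn: the user side
                conv.add_message(message)
                await run(conv, url)
        except Exception as e:
            print(e)  # log the failure, keep what was recreated so far
        return conv.messages


async def _main():
    url = "http://localhost:8080/v1/chat/completions"  # assumed endpoint
    sem = asyncio.Semaphore(16)  # cap concurrent requests
    conversations = [[{"role": "user", "content": "Hello"}]]
    futures = [recreate_conversation(c, sem, url) for c in conversations]
    print(await asyncio.gather(*futures))
    await client.aclose()


asyncio.run(_main())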

medusa/model/medusa_model_legacy.py

Lines changed: 5 additions & 9 deletions
@@ -25,12 +25,14 @@ def __init__(
         self,
         medusa_num_heads=4,
         medusa_num_layers=1,
+        version="2",
         base_model_name_or_path="lmsys/vicuna-7b-v1.3",
         **kwargs,
     ):
         super().__init__(**kwargs)
         self.medusa_num_heads = medusa_num_heads
         self.medusa_num_layers = medusa_num_layers
+        self.version = version
         self.base_model_name_or_path = base_model_name_or_path
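The new version field (default "2") travels with saved checkpoints, letting loaders distinguish heads that carry their own vocabulary projection (version 1) from heads that reuse the base model's lm_head (version 2, the hunks below). A minimal round-trip sketch, assuming the config class extends transformers.PretrainedConfig so instance attributes are serialized into config.json:

# Assumes MedusaConfig extends PretrainedConfig, as the super().__init__(**kwargs)
# call above suggests; the directory name is arbitrary.
from transformers import PretrainedConfig


class MedusaConfig(PretrainedConfig):
    def __init__(
        self,
        medusa_num_heads=4,
        medusa_num_layers=1,
        version="2",
        base_model_name_or_path="lmsys/vicuna-7b-v1.3",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.medusa_num_heads = medusa_num_heads
        self.medusa_num_layers = medusa_num_layers
        self.version = version
        self.base_model_name_or_path = base_model_name_or_path


cfg = MedusaConfig()
cfg.save_pretrained("medusa_ckpt")  # writes medusa_ckpt/config.json
print(MedusaConfig.from_pretrained("medusa_ckpt").version)  # -> "2"
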
@@ -101,7 +103,6 @@ def __init__(
             [
                 nn.Sequential(
                     *([ResBlock(self.hidden_size)] * medusa_num_layers),
-                    nn.Linear(self.hidden_size, self.vocab_size, bias=False),
                 )
                 for _ in range(medusa_num_heads)
             ]
@@ -110,13 +111,6 @@ def __init__(
         # Ensure medusa_head's dtype and device align with the base_model
         self.medusa_head.to(self.base_model.dtype).to(self.base_model.device)

-        import deepspeed
-        params = [base_model.lm_head.weight]
-        with deepspeed.zero.GatheredParameters(params):
-            for i in range(medusa_num_heads):
-                # Initialize the weights of each medusa_head using the base model's weights
-                self.medusa_head[i][-1].weight.data[:] = base_model.lm_head.weight.data[:]
-
     def get_tokenizer(self):
         """Get the tokenizer of the base model.
@@ -207,7 +201,9 @@ def forward(
         medusa_logits = []
         # TODO: Consider parallelizing this loop for efficiency?
         for i in range(self.medusa):
-            medusa_logits.append(self.medusa_head[i](hidden_states))
+            mhidden_states = self.medusa_head[i](hidden_states)
+            mlogits = self.base_model.lm_head(mhidden_states)
+            medusa_logits.append(mlogits)
         if output_orig:
             return torch.stack(medusa_logits, dim=0), outputs, orig
         return torch.stack(medusa_logits, dim=0)
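
These three hunks are the substance of the medusa2 change. Each head previously ended in its own nn.Linear(hidden_size, vocab_size), copy-initialized from base_model.lm_head under a DeepSpeed ZeRO gather, which duplicated a vocabulary-sized matrix per head. The heads now stop at hidden size and the base model's lm_head performs the final projection for all of them, so the copy-initialization (and the deepspeed import) has nothing left to do. A runnable shape-flow sketch with toy sizes; this ResBlock is a plausible stand-in, not necessarily the repo's exact block:

# Toy sizes; ResBlock and lm_head are stand-ins for the repo's modules.
import torch
import torch.nn as nn

hidden_size, vocab_size = 64, 1000
medusa_num_heads, medusa_num_layers = 4, 1


class ResBlock(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.linear = nn.Linear(hidden_size, hidden_size)
        self.act = nn.SiLU()

    def forward(self, x):
        return x + self.act(self.linear(x))


lm_head = nn.Linear(hidden_size, vocab_size, bias=False)  # stands in for base_model.lm_head
medusa_head = nn.ModuleList(
    [
        nn.Sequential(*([ResBlock(hidden_size)] * medusa_num_layers))
        for _ in range(medusa_num_heads)
    ]
)

hidden_states = torch.randn(2, 5, hidden_size)  # (batch, seq_len, hidden)
medusa_logits = []
for i in range(medusa_num_heads):
    mhidden_states = medusa_head[i](hidden_states)  # still hidden-sized
    medusa_logits.append(lm_head(mhidden_states))   # shared vocab projection
print(torch.stack(medusa_logits, dim=0).shape)  # torch.Size([4, 2, 5, 1000])

At vicuna-7B scale (hidden size 4096, vocabulary 32000), dropping the per-head Linear saves roughly 4096 x 32000, about 131M parameters, per head.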

medusa/train/train_legacy.py

Lines changed: 15 additions & 8 deletions
@@ -335,6 +335,20 @@ def train():
         config.rope_scaling = {"type": "linear", "factor": scaling_factor}
     config.use_cache = False

+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        model_args.model_name_or_path,
+        cache_dir=training_args.cache_dir,
+        model_max_length=training_args.model_max_length,
+        padding_side="right",
+        use_fast=True,
+    )
+    tokenizer.pad_token = tokenizer.unk_token
+    tokenizer.pad_token = tokenizer.eos_token
+
+    # Making sure the tokenizer works before loading the model.
+    print(tokenizer(["This is a test", "secondary"], padding=True))
+    print(tokenizer.apply_chat_template([{"role": "user", "content": "This is a test"}]))
+
     # Load model and tokenizer
     model = transformers.AutoModelForCausalLM.from_pretrained(
         model_args.model_name_or_path,
@@ -358,14 +372,6 @@ def train():
     # Format output dir
     training_args.output_dir = f"{training_args.output_dir}_medusa_mlp_{model_args.model_name_or_path.split('/')[-1]}_medusa_{training_args.medusa_num_heads}_lr_{training_args.learning_rate}_layers_{training_args.medusa_num_layers}"

-    tokenizer = transformers.AutoTokenizer.from_pretrained(
-        model_args.model_name_or_path,
-        cache_dir=training_args.cache_dir,
-        model_max_length=training_args.model_max_length,
-        padding_side="right",
-        use_fast=True,
-    )
-    tokenizer.pad_token = tokenizer.unk_token

     # Load data
     data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
@@ -375,6 +381,7 @@ def train():
         medusa_num_heads=training_args.medusa_num_heads,
         medusa_num_layers=training_args.medusa_num_layers,
         base_model_name_or_path=model_args.model_name_or_path,
+        version="2"
     )

     # Save Medusa config
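
Moving tokenizer construction ahead of the expensive AutoModelForCausalLM.from_pretrained call makes tokenizer problems (a missing pad token, a broken chat template) fail in seconds instead of after a long model load; that is what the two print calls exercise. Note the diff keeps both pad_token assignments, so the later eos_token one is the effective choice. A minimal sketch of the fail-fast ordering, assuming a checkpoint that ships a chat template:

# Sketch of the fail-fast ordering; the model name is an example and the
# checkpoint is assumed to provide a chat template for apply_chat_template.
import transformers

model_name = "lmsys/vicuna-7b-v1.3"

tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_name,
    model_max_length=2048,
    padding_side="right",
    use_fast=True,
)
tokenizer.pad_token = tokenizer.eos_token  # the later of the diff's two assignments

# Exercise padding and the chat template before paying for the model load.
print(tokenizer(["This is a test", "secondary"], padding=True))
print(tokenizer.apply_chat_template([{"role": "user", "content": "This is a test"}]))

model = transformers.AutoModelForCausalLM.from_pretrained(model_name)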
