Commit 2b8020f

Creating medusa2.
Turns out creating entire weight matrices for the lm_heads costs a huge amount of VRAM (especially for multilingual models like Gemma) and is not necessary at all to get good speculation. This PR modifies the legacy code to create new Medusa models without duplicating this lm_head, making them much more efficient to run. It also increments the version number of the config so users can tell how to actually run the model.
1 parent 5e98053 commit 2b8020f
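
For a sense of scale, a rough back-of-the-envelope sketch of what duplicating the lm_head per Medusa head costs. The dimensions below are assumptions for illustration, not values read from this repo:

# Rough VRAM estimate for per-head lm_head copies (illustrative assumptions only)
vocab_size = 256_000       # assumed Gemma-sized multilingual vocabulary
hidden_size = 4096         # assumed hidden dimension
num_heads = 4              # default medusa_num_heads
bytes_per_param = 2        # fp16 / bf16

extra = vocab_size * hidden_size * bytes_per_param * num_heads
print(f"{extra / 1e9:.1f} GB of duplicated lm_head weights")  # ~8.4 GB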

3 files changed: +27 -23 lines changed


create_data.py

Lines changed: 7 additions & 6 deletions
@@ -11,12 +11,12 @@

 client = httpx.AsyncClient(timeout=None)

-async def run(conv: Conversation):
+async def run(conv: Conversation, url: str):
     payload = {"model":"tgi", "messages": conv.messages}
     response = await client.post(url, json=payload)
     content = response.json()
     message = content["choices"][0]["message"]
-    message.pop("name")
+    message.pop("name", None)
     conv.add_message(message)


@@ -34,15 +34,16 @@ def fix_source(source):
     return new_source


-async def recreate_conversation(conversation, sem):
+async def recreate_conversation(conversation, sem, url):
     async with sem:
         conv = Conversation()
         try:
             for message in conversation[::2]:
                 assert message["role"] == "user"
                 conv.add_message(message)
-                await run(conv)
-        except Exception:
+                await run(conv, url)
+        except Exception as e:
+            print(e)
             pass
         return conv.messages

@@ -62,7 +63,7 @@ async def _main():

     futures = []
     for conversation in conversations:
-        future = recreate_conversation(conversation, sem)
+        future = recreate_conversation(conversation, sem, url)
         futures.append(future)

     recreated_conversations = await tqdm.asyncio.tqdm.gather(*futures)
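
One small but load-bearing change above: message.pop("name", None) supplies a default, so an API response without a "name" field no longer raises. A minimal illustration:

# dict.pop with a default never raises; without one it raises KeyError if the key is absent
message = {"role": "assistant", "content": "hi"}  # no "name" field
message.pop("name", None)   # returns None, message unchanged
# message.pop("name")       # would raise KeyError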

medusa/model/medusa_model_legacy.py

Lines changed: 5 additions & 9 deletions
@@ -25,12 +25,14 @@ def __init__(
         self,
         medusa_num_heads=4,
         medusa_num_layers=1,
+        version="2",
         base_model_name_or_path="lmsys/vicuna-7b-v1.3",
         **kwargs,
     ):
         super().__init__(**kwargs)
         self.medusa_num_heads = medusa_num_heads
         self.medusa_num_layers = medusa_num_layers
+        self.version = version
         self.base_model_name_or_path = base_model_name_or_path


@@ -101,7 +103,6 @@ def __init__(
             [
                 nn.Sequential(
                     *([ResBlock(self.hidden_size)] * medusa_num_layers),
-                    nn.Linear(self.hidden_size, self.vocab_size, bias=False),
                 )
                 for _ in range(medusa_num_heads)
             ]
@@ -110,13 +111,6 @@
         # Ensure medusa_head's dtype and device align with the base_model
         self.medusa_head.to(self.base_model.dtype).to(self.base_model.device)

-        import deepspeed
-        params = [base_model.lm_head.weight]
-        with deepspeed.zero.GatheredParameters(params):
-            for i in range(medusa_num_heads):
-                # Initialize the weights of each medusa_head using the base model's weights
-                self.medusa_head[i][-1].weight.data[:] = base_model.lm_head.weight.data[:]
-
     def get_tokenizer(self):
         """Get the tokenizer of the base model.

@@ -207,7 +201,9 @@ def forward(
         medusa_logits = []
         # TODO: Consider parallelizing this loop for efficiency?
         for i in range(self.medusa):
-            medusa_logits.append(self.medusa_head[i](hidden_states))
+            mhidden_states = self.medusa_head[i](hidden_states)
+            mlogits = self.base_model.lm_head(mhidden_states)
+            medusa_logits.append(mlogits)
         if output_orig:
             return torch.stack(medusa_logits, dim=0), outputs, orig
         return torch.stack(medusa_logits, dim=0)
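
Taken together, the per-head computation after this change looks roughly like the following. This is a sketch, not the class from the repo: the ResBlock definition, the sizes, and the standalone lm_head below are stand-ins for the real base-model objects.

import torch
import torch.nn as nn

class ResBlock(nn.Module):
    # Stand-in residual block; the repo's own ResBlock is assumed to be similar
    def __init__(self, hidden_size):
        super().__init__()
        self.linear = nn.Linear(hidden_size, hidden_size)
        self.act = nn.SiLU()

    def forward(self, x):
        return x + self.act(self.linear(x))

hidden_size, vocab_size, num_heads, num_layers = 4096, 32000, 4, 1

# Shared projection to the vocabulary: in the real model this is base_model.lm_head
lm_head = nn.Linear(hidden_size, vocab_size, bias=False)

# Medusa-2 style heads: residual MLP blocks only, no per-head vocab projection
medusa_head = nn.ModuleList(
    [nn.Sequential(*[ResBlock(hidden_size) for _ in range(num_layers)]) for _ in range(num_heads)]
)

hidden_states = torch.randn(1, 8, hidden_size)
# Each head refines the hidden states, then reuses the single shared lm_head
medusa_logits = torch.stack([lm_head(head(hidden_states)) for head in medusa_head], dim=0)
print(medusa_logits.shape)  # torch.Size([4, 1, 8, 32000])

The vocab-sized weight matrix now exists once instead of once per head, which is where the VRAM saving comes from.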

medusa/train/train_legacy.py

Lines changed: 15 additions & 8 deletions
@@ -335,6 +335,20 @@ def train():
         config.rope_scaling = {"type": "linear", "factor": scaling_factor}
     config.use_cache = False

+    tokenizer = transformers.AutoTokenizer.from_pretrained(
+        model_args.model_name_or_path,
+        cache_dir=training_args.cache_dir,
+        model_max_length=training_args.model_max_length,
+        padding_side="right",
+        use_fast=True,
+    )
+    tokenizer.pad_token = tokenizer.unk_token
+    tokenizer.pad_token = tokenizer.eos_token
+
+    # Making sure the tokenizer works before loading the model.
+    print(tokenizer(["This is a test", "secondary"], padding=True))
+    print(tokenizer.apply_chat_template([{"role": "user", "content": "This is a test"}]))
+
     # Load model and tokenizer
     model = transformers.AutoModelForCausalLM.from_pretrained(
         model_args.model_name_or_path,
@@ -358,14 +372,6 @@ def train():
     # Format output dir
     training_args.output_dir = f"{training_args.output_dir}_medusa_mlp_{model_args.model_name_or_path.split('/')[-1]}_medusa_{training_args.medusa_num_heads}_lr_{training_args.learning_rate}_layers_{training_args.medusa_num_layers}"

-    tokenizer = transformers.AutoTokenizer.from_pretrained(
-        model_args.model_name_or_path,
-        cache_dir=training_args.cache_dir,
-        model_max_length=training_args.model_max_length,
-        padding_side="right",
-        use_fast=True,
-    )
-    tokenizer.pad_token = tokenizer.unk_token

     # Load data
     data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
@@ -375,6 +381,7 @@ def train():
         medusa_num_heads=training_args.medusa_num_heads,
         medusa_num_layers=training_args.medusa_num_layers,
         base_model_name_or_path=model_args.model_name_or_path,
+        version="2"
     )

     # Save Medusa config
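
The version="2" field exists so loaders can tell whether head outputs are already logits (legacy heads) or hidden states that still need the shared lm_head (heads trained after this commit). A hypothetical consumer-side check, not code from this commit:

# Hypothetical downstream logic (illustrative only; names are placeholders)
head_out = medusa_head[i](hidden_states)
if getattr(medusa_config, "version", "1") == "2":
    logits = base_model.lm_head(head_out)  # v2 heads emit hidden states
else:
    logits = head_out                      # legacy heads already emit logits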
