import torch.nn as nn
from transformers import PreTrainedModel, PretrainedConfig
from .modeling_llama_kv import LlamaForCausalLM as KVLlamaForCausalLM
+from transformers import AutoTokenizer
from .utils import *
from .kv_cache import initialize_past_key_values
-from .medusa_choices import mc_sim_7b_63
-from transformers import AutoTokenizer
import os
from huggingface_hub import hf_hub_download


class MedusaConfig(PretrainedConfig):
    def __init__(
        self,
-        medusa_num_heads=4,
+        medusa_num_heads=2,
        medusa_num_layers=1,
        base_model_name_or_path="lmsys/vicuna-7b-v1.3",
        **kwargs,
@@ -111,7 +110,6 @@ def get_tokenizer(self):
    def from_pretrained(
        cls,
        medusa_head_name_or_path,
-        medusa_num_heads=None,
        **kwargs,
    ):
        """
@@ -123,12 +121,9 @@ def from_pretrained(
            MedusaModel: A MedusaModel instance loaded from the given path.
        """
        medusa_config = MedusaConfig.from_pretrained(medusa_head_name_or_path)
-        if medusa_num_heads is not None:
-            medusa_config.medusa_num_heads = medusa_num_heads
        base_model = KVLlamaForCausalLM.from_pretrained(
            medusa_config.base_model_name_or_path, **kwargs
        )
-
        model = cls(
            base_model,
            medusa_config.medusa_num_heads,
@@ -196,7 +191,7 @@ def medusa_generate(
        max_steps=512,
        # The hyperparameters below are for the Medusa
        # top-1 prediction for the next token, top-7 predictions for the next token, top-6 predictions for the next next token.
-        medusa_choices=mc_sim_7b_63,
+        medusa_choices=[1, 7, 6],
        posterior_threshold=0.09,  # threshold validation of Medusa output
        # another threshold hyperparameter, recommended to be sqrt(posterior_threshold)
        posterior_alpha=0.3,
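For context, posterior_threshold and posterior_alpha parameterize the typical-acceptance check that validates drafted tokens against the base model's distribution; as the comment notes, posterior_alpha is recommended to be sqrt(posterior_threshold), and sqrt(0.09) = 0.3. Below is a minimal sketch of one plausible form of that check; the function name, shapes, and exact formula are assumptions for illustration, not the code this commit calls.

import torch

def typical_accept(probs, token_id, posterior_threshold=0.09, posterior_alpha=0.3):
    # probs: the base model's probability distribution at one position (1-D, sums to 1).
    # Accept the drafted token if its probability clears a bar of
    # min(posterior_threshold, posterior_alpha * exp(-entropy)): flat (high-entropy)
    # distributions get a looser bar, peaked ones a stricter one.
    entropy = -(probs * torch.log(probs + 1e-5)).sum()
    bar = torch.minimum(
        torch.tensor(posterior_threshold),
        posterior_alpha * torch.exp(-entropy),
    )
    return bool(probs[token_id] > bar)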
@@ -230,6 +225,7 @@ def medusa_generate(
        self.medusa_buffers = medusa_buffers
        self.medusa_choices = medusa_choices

+        medusa_topk = medusa_choices[1:]

        # Initialize the past key and value states
        if hasattr(self, "past_key_values"):
@@ -264,8 +260,9 @@ def medusa_generate(
            candidates, tree_candidates = generate_candidates(
                medusa_logits,
                logits,
+                medusa_topk,
                medusa_buffers["tree_indices"],
-                medusa_buffers["retrieve_indices"],
+                temperature,
            )

            # Use tree attention to verify the candidates and get predictions
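With medusa_choices given as a flat list, the first entry is the base model's top-1 prediction for the next token and the remaining entries (medusa_topk = medusa_choices[1:]) are the top-k widths of the Medusa heads. The sketch below only illustrates how [1, 7, 6] expands into candidate continuations, using dummy logits and hypothetical names; it is not the generate_candidates implementation called above, which also consumes tree_indices and temperature.

import itertools
import torch

vocab_size = 32000
logits = torch.randn(vocab_size)            # base-model logits for the next token
medusa_logits = torch.randn(2, vocab_size)  # one row of logits per Medusa head
medusa_choices = [1, 7, 6]
medusa_topk = medusa_choices[1:]            # [7, 6], as in the hunk above

base_tokens = logits.topk(medusa_choices[0]).indices.tolist()  # top-1 next token
head_tokens = [
    medusa_logits[i].topk(k).indices.tolist()                  # top-k per Medusa head
    for i, k in enumerate(medusa_topk)
]

# Cartesian product: 1 * 7 * 6 = 42 candidate continuations, each 3 tokens long.
candidates = [list(c) for c in itertools.product(base_tokens, *head_tokens)]
print(len(candidates), len(candidates[0]))  # -> 42 3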