import os

import torch
import torch.nn as nn
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer, PretrainedConfig

from .kv_cache import initialize_past_key_values
from .medusa_choices import mc_sim_7b_63
from .modeling_llama_kv import LlamaForCausalLM as KVLlamaForCausalLM
from .utils import *


class MedusaConfig(PretrainedConfig):
    """
    Configuration class for the Medusa model.

    Args:
        medusa_num_heads (int, optional): Number of Medusa heads. Defaults to 4.
        medusa_num_layers (int, optional): Number of ResBlock layers in each Medusa head. Defaults to 1.
        base_model_name_or_path (str, optional): Name or path of the base model. Defaults to "lmsys/vicuna-7b-v1.3".
        **kwargs: Additional keyword arguments passed to the parent class constructor.
    """

    def __init__(
        self,
        medusa_num_heads=4,
        medusa_num_layers=1,
        base_model_name_or_path="lmsys/vicuna-7b-v1.3",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.medusa_num_heads = medusa_num_heads
        self.medusa_num_layers = medusa_num_layers
        self.base_model_name_or_path = base_model_name_or_path
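
# A minimal sketch of config round-tripping (the directory path is
# hypothetical); save_pretrained/from_pretrained are inherited from
# PretrainedConfig:
#
#   config = MedusaConfig(medusa_num_heads=4, medusa_num_layers=1)
#   config.save_pretrained("./medusa-head-checkpoint")  # writes config.json
#   config = MedusaConfig.from_pretrained("./medusa-head-checkpoint")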

class ResBlock(nn.Module):
    """
    A residual block.

    This module applies a linear transformation followed by a SiLU activation,
    then adds the result to the original input, forming a residual connection.

    Args:
        hidden_size (int): The size of the hidden layers in the block.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.linear = nn.Linear(hidden_size, hidden_size)
        # Zero-initialize the weight so the block starts out near-identity:
        # with a zero weight, the output reduces to x + SiLU(bias).
        torch.nn.init.zeros_(self.linear.weight)
        # Use SiLU activation to stay consistent with the Llama model.
        self.act = nn.SiLU()

    def forward(self, x):
        """
        Forward pass of the ResBlock.

        Args:
            x (torch.Tensor): Input tensor.

        Returns:
            torch.Tensor: Output after the residual connection and activation.
        """
        return x + self.act(self.linear(x))
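
# A quick sanity check of the zero-init behavior (illustrative only): because
# the weight starts at zero, linear(x) returns just the bias, so the block
# output is x + SiLU(bias):
#
#   block = ResBlock(hidden_size=16)
#   x = torch.randn(2, 16)
#   assert torch.allclose(block(x), x + block.act(block.linear.bias))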

class MedusaModel(nn.Module):
    """The Medusa language model head.

    This module creates a series of prediction heads (one per Medusa head) on
    top of a given base model. Each head is composed of a sequence of residual
    blocks followed by a linear layer.
    """

    def __init__(
        self,
        base_model,
        medusa_num_heads=4,
        medusa_num_layers=1,
        base_model_name_or_path="lmsys/vicuna-7b-v1.3",
    ):
        """
        Args:
            base_model (nn.Module): The base language model to be used.
            medusa_num_heads (int, optional): Number of additional tokens to predict. Defaults to 4.
            medusa_num_layers (int, optional): Number of ResBlock layers in each Medusa head. Defaults to 1.
            base_model_name_or_path (str, optional): Name or path of the base model. Defaults to "lmsys/vicuna-7b-v1.3".
        """
        super().__init__()
        self.base_model = base_model
        self.config = base_model.config
        self.hidden_size = base_model.lm_head.weight.shape[-1]
        self.vocab_size = base_model.lm_head.weight.shape[0]
        self.medusa = medusa_num_heads
        self.medusa_num_layers = medusa_num_layers
        self.base_model_name_or_path = base_model_name_or_path
        self.tokenizer = AutoTokenizer.from_pretrained(self.base_model_name_or_path)
        # Create a list of Medusa heads, instantiating a fresh ResBlock per
        # layer so the blocks do not share parameters.
        self.medusa_head = nn.ModuleList(
            [
                nn.Sequential(
                    *[ResBlock(self.hidden_size) for _ in range(medusa_num_layers)],
                    nn.Linear(self.hidden_size, self.vocab_size, bias=False),
                )
                for _ in range(medusa_num_heads)
            ]
        )
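        # Each head is Sequential(ResBlock x medusa_num_layers, Linear), so
        # head i's final projection is self.medusa_head[i][-1] and, for the
        # one-layer default, its checkpoint keys look like "i.0.linear.weight"
        # and "i.1.weight".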

        # Ensure medusa_head's dtype and device align with the base model.
        self.medusa_head.to(self.base_model.dtype).to(self.base_model.device)

        # Initialize each head's output projection from the base model's LM
        # head weights.
        for i in range(medusa_num_heads):
            self.medusa_head[i][-1].weight.data[:] = base_model.lm_head.weight.data[:]

    def get_tokenizer(self):
        """Get the tokenizer of the base model.

        Returns:
            Tokenizer: The tokenizer of the base model.
        """
        return self.tokenizer

    @classmethod
    def from_pretrained(
        cls,
        medusa_head_name_or_path,
        base_model=None,
        medusa_num_heads=None,
        **kwargs,
    ):
        """
        Args:
            medusa_head_name_or_path (str): Name or path of the Medusa head to load.
            base_model (str, optional): If given, overrides the base model recorded in the Medusa config.
            medusa_num_heads (int, optional): If given, overrides the number of heads recorded in the Medusa config.
            **kwargs: Additional keyword arguments for loading the base model.

        Returns:
            MedusaModel: A MedusaModel instance loaded from the given path.
        """
        medusa_config = MedusaConfig.from_pretrained(medusa_head_name_or_path)
        if medusa_num_heads is not None:
            print("Overriding medusa_num_heads as:", medusa_num_heads)
            medusa_config.medusa_num_heads = medusa_num_heads
        if base_model is not None:
            print("Overriding base_model as:", base_model)
            medusa_config.base_model_name_or_path = base_model

        base_model = KVLlamaForCausalLM.from_pretrained(
            medusa_config.base_model_name_or_path, **kwargs
        )

        model = cls(
            base_model,
            medusa_config.medusa_num_heads,
            medusa_config.medusa_num_layers,
            medusa_config.base_model_name_or_path,
        )
        # Load the Medusa head weights from a local checkpoint if present,
        # otherwise from the Hugging Face Hub.
        medusa_head_path = os.path.join(medusa_head_name_or_path, "medusa_lm_head.pt")
        if os.path.exists(medusa_head_path):
            filename = medusa_head_path
        else:
            filename = hf_hub_download(medusa_head_name_or_path, "medusa_lm_head.pt")
        medusa_head_state_dict = torch.load(filename, map_location=base_model.device)
        model.medusa_head.load_state_dict(medusa_head_state_dict, strict=False)

        return model
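
    # A typical load (illustrative; "FasterDecoding/medusa-vicuna-7b-v1.3" is
    # the released Medusa head for Vicuna-7B, and the extra kwargs are simply
    # forwarded to the base model's from_pretrained):
    #
    #   model = MedusaModel.from_pretrained(
    #       "FasterDecoding/medusa-vicuna-7b-v1.3",
    #       torch_dtype=torch.float16,
    #   )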

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        labels=None,
        past_key_values=None,
        output_orig=False,
        position_ids=None,
    ):
        """Forward pass of the MedusaModel.

        Args:
            input_ids (torch.Tensor, optional): Input token IDs.
            attention_mask (torch.Tensor, optional): Attention mask.
            labels (torch.Tensor, optional): Ground-truth labels for loss computation.
            past_key_values (tuple, optional): Past key and value states for attention.
            output_orig (bool, optional): Whether to also output predictions from the original LM head.
            position_ids (torch.Tensor, optional): Position IDs.

        Returns:
            torch.Tensor: Logits from all Medusa heads, stacked along dim 0.
            (Optional) Original predictions from the base model's LM head.
        """
        with torch.inference_mode():
            # Pass input through the base model.
            outputs = self.base_model.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                past_key_values=past_key_values,
                position_ids=position_ids,
            )
            if output_orig:
                orig = self.base_model.lm_head(outputs[0])
        # Clone the hidden states outside inference mode so the Medusa heads
        # can run with autograd (e.g. while training the heads).
        hidden_states = outputs[0].clone()
        medusa_logits = []
        # TODO: Consider parallelizing this loop for efficiency.
        for i in range(self.medusa):
            medusa_logits.append(self.medusa_head[i](hidden_states))
        if output_orig:
            return torch.stack(medusa_logits, dim=0), outputs, orig
        return torch.stack(medusa_logits, dim=0)
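    # Shape sketch (illustrative): with batch size B, sequence length T, and
    # K Medusa heads, forward() returns logits of shape (K, B, T, vocab_size);
    # with output_orig=True it also returns the base-model outputs and the
    # original LM-head logits of shape (B, T, vocab_size):
    #
    #   medusa_logits, outputs, orig = model(input_ids, output_orig=True)
    #   assert medusa_logits.shape == (model.medusa, *orig.shape)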

    def medusa_generate(
        self,
        input_ids,
        attention_mask=None,
        temperature=0.0,
        max_steps=512,
        # The hyperparameters below configure Medusa's tree of candidates, e.g.
        # top-1 prediction for the next token, top-7 predictions for the next
        # next token, and top-6 predictions for the next next next token.
        medusa_choices=mc_sim_7b_63,
        posterior_threshold=0.09,  # threshold for validating Medusa output
        # another threshold hyperparameter, recommended to be sqrt(posterior_threshold)
        posterior_alpha=0.3,
    ):
        """
        Args:
            input_ids (torch.Tensor): Input token IDs.
            attention_mask (torch.Tensor, optional): Attention mask.
            temperature (float, optional): Temperature for typical acceptance.
            max_steps (int, optional): Maximum number of decoding steps.
            medusa_choices (list, optional): A list describing the candidate tree explored by the Medusa heads.
            posterior_threshold (float, optional): Threshold for posterior validation.
            posterior_alpha (float, optional): Another threshold hyperparameter, recommended to be sqrt(posterior_threshold).

        Yields:
            dict: A dictionary with key "text" containing the decoded output so far.

        Warning: only batch size 1 is supported for now!
        """
        assert input_ids.shape[0] == 1, "Only batch size 1 is supported for now!"
        # Avoid modifying input_ids in place.
        input_ids = input_ids.clone()

        # Cache the Medusa buffers (the fixed patterns for tree attention).
        if hasattr(self, "medusa_choices") and self.medusa_choices == medusa_choices:
            # Load the cached Medusa buffers.
            medusa_buffers = self.medusa_buffers
        else:
            # Initialize the Medusa buffers.
            medusa_buffers = generate_medusa_buffers(
                medusa_choices, device=self.base_model.device
            )
            self.medusa_buffers = medusa_buffers
            self.medusa_choices = medusa_choices
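        # For reference, a medusa_choices spec such as mc_sim_7b_63 enumerates
        # paths in the candidate tree as sequences of per-head top-k indices
        # (e.g. [0] is head 1's top choice and [0, 2] extends it with head 2's
        # third choice); mc_sim_7b_63 contains 63 such nodes.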

        # Initialize the past key and value states, reusing the cached buffers
        # if they already exist.
        if hasattr(self, "past_key_values"):
            past_key_values = self.past_key_values
            past_key_values_data = self.past_key_values_data
            current_length_data = self.current_length_data
            # Reset the past key and value states.
            current_length_data.zero_()
        else:
            (
                past_key_values,
                past_key_values_data,
                current_length_data,
            ) = initialize_past_key_values(self.base_model)
            self.past_key_values = past_key_values
            self.past_key_values_data = past_key_values_data
            self.current_length_data = current_length_data

        input_len = input_ids.shape[1]

        reset_medusa_mode(self)
        # Initialize the tree attention mask and process the prefill tokens.
        medusa_logits, logits = initialize_medusa(
            input_ids, self, medusa_buffers["medusa_attn_mask"], past_key_values
        )

        new_token = 0

        for idx in range(max_steps):
            # Generate candidates with top-k predictions from the Medusa heads.
            candidates, tree_candidates = generate_candidates(
                medusa_logits,
                logits,
                medusa_buffers["tree_indices"],
                medusa_buffers["retrieve_indices"],
            )

            # Use tree attention to verify the candidates and get predictions.
            medusa_logits, logits, outputs = tree_decoding(
                self,
                tree_candidates,
                past_key_values,
                medusa_buffers["medusa_position_ids"],
                input_ids,
                medusa_buffers["retrieve_indices"],
            )

            # Evaluate the posterior of the candidates to select the accepted
            # candidate prefix.
            best_candidate, accept_length = evaluate_posterior(
                logits, candidates, temperature, posterior_threshold, posterior_alpha
            )
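            # For context: with temperature > 0, evaluate_posterior is expected
            # to apply Medusa's typical-acceptance test, accepting a candidate
            # token x roughly when
            #     p(x) > min(posterior_threshold, posterior_alpha * exp(-H(p)))
            # where H(p) is the entropy of the base model's next-token
            # distribution; with temperature == 0 it reduces to a greedy match
            # against the base model's argmax.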

            # Update input_ids and logits with the accepted prefix.
            input_ids, logits, medusa_logits, new_token = update_inference_inputs(
                input_ids,
                candidates,
                best_candidate,
                accept_length,
                medusa_buffers["retrieve_indices"],
                outputs,
                logits,
                medusa_logits,
                new_token,
                past_key_values_data,
                current_length_data,
            )

            # Stream the text generated so far.
            yield {
                "text": self.tokenizer.decode(
                    input_ids[0, input_len:],
                    skip_special_tokens=True,
                    spaces_between_special_tokens=False,
                    clean_up_tokenization_spaces=True,
                )
            }

            # Stop once an EOS token has been produced.
            if self.tokenizer.eos_token_id in input_ids[0, input_len:]:
                break
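
# A minimal streaming loop (illustrative; prompt formatting for Vicuna-style
# chat models is left to the caller):
#
#   model = MedusaModel.from_pretrained("FasterDecoding/medusa-vicuna-7b-v1.3")
#   tokenizer = model.get_tokenizer()
#   input_ids = tokenizer("Hello, my name is", return_tensors="pt").input_ids
#   input_ids = input_ids.to(model.base_model.device)
#   for step in model.medusa_generate(input_ids, temperature=0.7, max_steps=64):
#       print(step["text"])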