
Commit 89f8ec0

Merge branch 'main' into sparse_tree
2 parents 5d374e9 + 8ce8fa9 commit 89f8ec0

File tree

7 files changed (+417 lines, −42 lines)


README.md

Lines changed: 7 additions & 6 deletions
````diff
@@ -7,7 +7,8 @@ medusa-llm"><b>Blog</b></a> | <a href="ROADMAP.md"><b>Roadmap</b></a> |
 
 ---
 *News* 🔥
-- [2023/09] Medusa v0.1 is released! 🎉
+- [2023/09] Medusa won the [Chai Prize Grant](https://twitter.com/tianle_cai/status/1703891335147897341)🎉 The prize will be used as a development bounty for those who help us achieve milestones in our [roadmap](https://github.com/FasterDecoding/Medusa/issues/3)!
+- [2023/09] Medusa v0.1 is released!
 
 ---
 ## Introduction
@@ -78,7 +79,7 @@ In this initial release, our primary focus is on optimizing Medusa for a batch s
 ```bash
 pip install medusa-llm
 ```
-### Method 2: From source
+### Method 2: From the source
 ```bash
 git clone https://github.com/FasterDecoding/Medusa.git
 cd Medusa
@@ -95,11 +96,11 @@ pip install -e .
 ### Inference
 We currently support single-GPU inference with a batch size of 1, which is the most common setup for local model hosting. We are actively working to extend Medusa's capabilities by integrating it into other inference frameworks; please don't hesitate to reach out if you are interested in contributing to this effort.
 
-You can use the following command for launching a CLI interface:
+You can use the following command to launch a CLI interface:
 ```bash
 CUDA_VISIBLE_DEVICES=0 python -m medusa.inference.cli --model [path of medusa model]
 ```
-You can also pass `--load-in-8bit` or `--load-in-4bit` to load the base model in quantized format.
+You can also pass `--load-in-8bit` or `--load-in-4bit` to load the base model in quantized format. If you download the base model elsewhere, you may override base model name or path with `--base-model [path of base model]`.
 
 ### Training
 For training, please install:
@@ -111,7 +112,7 @@ We take a public version of the ShareGPT dataset, which is a subset of the Vicun
 ```bash
 git clone https://huggingface.co/datasets/Aeala/ShareGPT_Vicuna_unfiltered
 ```
-Remark: If you haven't installed `git-lfs`, please install it before clone:
+Remark: If you haven't installed `git-lfs`, please install it before cloning:
 ```bash
 git lfs install
 ```
@@ -158,7 +159,7 @@ python -m medusa.hf_utils --folder [path of the model folder] --repo [name of th
 ```
 
 ## Codebase Guide
-`medusa/model/medusa_model.py` is the key file for Medusa. It contains the `MedusaModel` class, which is a wrapper of the original model and the new heads. This class also has implementation of a streaming generation method. If you want to dive into the details of Medusa, this is the place to start.
+`medusa/model/medusa_model.py` is the key file for Medusa. It contains the `MedusaModel` class, which is a wrapper of the original model and the new heads. This class also has an implementation of a streaming generation method. If you want to dive into the details of Medusa, this is the place to start.
 
 We also provide some illustrative notebooks in `notebooks/` to help you understand the codebase.
````

medusa/inference/cli.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -36,6 +36,7 @@ def main(args):
     try:
         model = MedusaModel.from_pretrained(
             args.model,
+            args.base_model,
             torch_dtype=torch.float16,
             low_cpu_mem_usage=True,
             device_map="auto",
@@ -185,6 +186,7 @@ def reload_conv(conv):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", type=str, required=True, help="Model name or path.")
+    parser.add_argument("--base-model", type=str, default=None, help="Base model name or path.")
     parser.add_argument(
         "--load-in-8bit", action="store_true", help="Use 8-bit quantization"
     )
```
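
The new `--base-model` flag simply becomes a second positional argument to `MedusaModel.from_pretrained`; when left at its `None` default, the base model path still comes from the Medusa config (see the `medusa_model.py` hunk below). Here is a minimal sketch of the equivalent programmatic call, mirroring the arguments the updated `cli.py` passes; the two checkpoint paths are placeholders, not verified model names:

```python
# Sketch: load Medusa the same way the updated CLI does.
# Both paths are placeholders; substitute your own checkpoints.
import torch
from medusa.model.medusa_model import MedusaModel

model = MedusaModel.from_pretrained(
    "path/to/medusa-head-checkpoint",  # --model: Medusa heads + MedusaConfig
    "path/to/base-model",              # --base-model: overrides base_model_name_or_path
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
)
model.eval()
```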

medusa/model/kv_cache.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -4,6 +4,7 @@
 class KVCache:
     """
     A key-value cache for the model.
+
     This class provides a mechanism to maintain a growing cache of keys and values,
     particularly useful for models that benefit from caching previous states,
     like transformers during autoregressive decoding.
@@ -15,6 +16,8 @@ class KVCache:
 
     def __init__(self, data, current_length):
         """
+        Initialize the KVCache.
+
         Args:
             data (torch.Tensor): Initial tensor to store the keys and values.
             current_length (int): Initial length of the data.
```
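
This commit only touches the `KVCache` docstrings, so the update logic is not visible here. As a rough illustration of the pattern the docstring describes, a preallocated buffer plus a `current_length` cursor, and not the repo's actual class, a toy version might look like this:

```python
# Toy sketch of a growing key/value cache; NOT the implementation in kv_cache.py.
# A buffer is preallocated and `current_length` tracks how much of it is valid.
import torch

class ToyKVCache:
    def __init__(self, data: torch.Tensor, current_length: int = 0):
        self.data = data                      # e.g. [batch, heads, max_len, head_dim]
        self.current_length = current_length  # number of valid positions so far

    def append(self, new_kv: torch.Tensor, dim: int = -2) -> torch.Tensor:
        """Copy new keys/values into the buffer and return the valid slice."""
        length = new_kv.shape[dim]
        self.data.narrow(dim, self.current_length, length).copy_(new_kv)
        self.current_length += length
        return self.data.narrow(dim, 0, self.current_length)

buf = torch.zeros(1, 8, 16, 64)                # room for 16 positions
cache = ToyKVCache(buf)
step = cache.append(torch.randn(1, 8, 1, 64))  # one autoregressive decoding step
```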

medusa/model/medusa_model.py

Lines changed: 23 additions & 2 deletions
```diff
@@ -11,6 +11,16 @@
 
 
 class MedusaConfig(PretrainedConfig):
+    """
+    Configuration class for Medusa model.
+
+    Args:
+        medusa_num_heads (int, optional): Number of heads for the Medusa layer. Default is 2.
+        medusa_num_layers (int, optional): Number of Medusa layers. Default is 1.
+        base_model_name_or_path (str, optional): The name or path of the base model. Default is "lmsys/vicuna-7b-v1.3".
+        **kwargs: Additional keyword arguments to be passed to the parent class constructor.
+    """
+
     def __init__(
         self,
         medusa_num_heads=4,
@@ -25,10 +35,14 @@ def __init__(
 
 
 class ResBlock(nn.Module):
-    """A Residual Block module.
+    """
+    A Residual Block module.
 
     This module performs a linear transformation followed by a SiLU activation,
     and then adds the result to the original input, creating a residual connection.
+
+    Args:
+        hidden_size (int): The size of the hidden layers in the block.
     """
 
     def __init__(self, hidden_size):
@@ -40,7 +54,8 @@ def __init__(self, hidden_size):
         self.act = nn.SiLU()
 
     def forward(self, x):
-        """Forward pass of the ResBlock.
+        """
+        Forward pass of the ResBlock.
 
         Args:
             x (torch.Tensor): Input tensor.
@@ -112,6 +127,7 @@ def from_pretrained(
         cls,
         medusa_head_name_or_path,
         medusa_num_heads=None,
+        base_model=None,
         **kwargs,
     ):
         """
@@ -124,7 +140,12 @@ def from_pretrained(
         """
         medusa_config = MedusaConfig.from_pretrained(medusa_head_name_or_path)
         if medusa_num_heads is not None:
+            print("Overriding medusa_num_heads as:", medusa_num_heads)
             medusa_config.medusa_num_heads = medusa_num_heads
+        if base_model is not None:
+            print("Overriding base_model as:", base_model)
+            medusa_config.base_model_name_or_path = base_model
+
         base_model = KVLlamaForCausalLM.from_pretrained(
             medusa_config.base_model_name_or_path, **kwargs
         )
```
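
The expanded `ResBlock` docstring above fully describes the block's behaviour: a linear transform, a SiLU activation, and a residual add. As a sketch consistent with that description (not copied from the repo; weight initialization and other details are outside these hunks):

```python
# Sketch of a residual block matching the docstring: out = x + SiLU(Linear(x)).
# Initialization details are not shown in this commit.
import torch
import torch.nn as nn

class ResBlockSketch(nn.Module):
    def __init__(self, hidden_size: int):
        super().__init__()
        self.linear = nn.Linear(hidden_size, hidden_size)
        self.act = nn.SiLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Residual connection: add the activated transform back onto the input.
        return x + self.act(self.linear(x))

y = ResBlockSketch(64)(torch.randn(2, 10, 64))  # shape preserved: (2, 10, 64)
```

The residual form means a block whose linear layer outputs values near zero starts close to the identity mapping, which is a common choice when attaching new heads to an already trained base model.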
