
Commit 0bfdcd2

Adding recipe for other models (non llama, non vicuna).
1 parent 700ff84 commit 0bfdcd2

File tree

4 files changed, +157 −89 lines changed


README.md

Lines changed: 28 additions & 5 deletions
@@ -125,7 +125,7 @@ accelerate launch -m axolotl.cli.train examples/medusa/your_config.yml
 
 The data preparation code for self-distillation can be found in [`data_generation` folder](data_generation) of the current repo. For other datasets, you can directly download the data from the corresponding Hugging Face dataset repo.
 
-### Training (legacy)
+### Training on various architectures
 *The following instructions are for the initial release of Medusa; they provide a minimal example of how to train a Medusa-1 model. For the updated version, please refer to the previous section.*
 
 For training, please install:
@@ -141,14 +141,36 @@ Remark: If you haven't installed `git-lfs`, please install it before cloning:
 ```bash
 git lfs install
 ```
+
+#### Adapt the data to the model you want to enable Medusa on
+
+Start by launching an inference server of your choice that runs the model you want to train on.
+Let's use [mistralai/Mistral-7B-Instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) as an example.
+
+For instance, you can use [text-generation-inference](https://github.com/huggingface/text-generation-inference), which you
+can also use after you've trained the Medusa heads.
+
+```
+model=mistralai/Mistral-7B-Instruct-v0.2
+volume=$PWD/data # share a volume with the Docker container to avoid downloading weights every run
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id $model --max-input-length 4000 --max-total-tokens 4096 --max-batch-prefill-tokens 4000
+```
+
+Some of the ShareGPT conversations are relatively long, so make sure the server can handle them. If there is not enough room, the script will simply skip those long conversations.
+This shouldn't hurt downstream performance much, but more data is always better.
+You can use various tradeoffs to [speed up inference](https://huggingface.co/docs/text-generation-inference/index), but the defaults should be good enough in most cases.
+
+```
+python create_data.py --input-filename ShareGPT_Vicuna_unfiltered/ShareGPT_V4.3_unfiltered_cleaned_split.json --output-filename mistral.json
+```
+
 #### Train the model
 We follow the training setup from [FastChat](https://github.com/lm-sys/FastChat#fine-tuning), but with a much larger learning rate because we freeze the original model and only train the new heads. Here is the training command for the Vicuna-7b model on 4 GPUs. Since we are only training the new heads, the training does not require a lot of memory, and only data parallelism is needed. You can modify the script to fit your own setup. For larger models, we use the same setup. You can also use `--load_in_8bit` or `--load_in_4bit` to load the base model in quantized format.
 ```bash
-torchrun --nproc_per_node=4 medusa/train/train.py --model_name_or_path lmsys/vicuna-7b-v1.3 \
-    --data_path ShareGPT_Vicuna_unfiltered/ShareGPT_V4.3_unfiltered_cleaned_split.json \
+torchrun --nproc_per_node=4 medusa/train/train_legacy.py --model_name_or_path mistralai/Mistral-7B-Instruct-v0.2 \
+    --data_path mistral.json \
     --bf16 True \
     --output_dir test \
-    --num_train_epochs 1 \
+    --num_train_epochs 2 \
     --per_device_train_batch_size 8 \
    --per_device_eval_batch_size 8 \
     --gradient_accumulation_steps 4 \
@@ -163,7 +185,8 @@ torchrun --nproc_per_node=4 medusa/train/train.py --model_name_or_path lmsys/vic
     --model_max_length 2048 \
     --lazy_preprocess True \
     --medusa_num_heads 3 \
-    --medusa_num_layers 1
+    --medusa_num_layers 1 \
+    --deepspeed deepspeed.json
 ```
 ### Push to Hugging Face Hub
 You can use the following command to push your model to the Hugging Face Hub:
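
The training command above now passes `--deepspeed deepspeed.json`, and the model and save code in this commit use `deepspeed.zero.GatheredParameters`, which only matters when parameters are partitioned under ZeRO stage 3. The actual `deepspeed.json` referenced by the command is not shown in this diff; purely as a hypothetical stand-in, a minimal ZeRO-3 config that defers batch-size and precision settings to the Hugging Face Trainer could be generated like this:

```python
import json

# Hypothetical minimal ZeRO-3 config; the repo's actual deepspeed.json may differ.
# "auto" values are filled in by the Hugging Face Trainer from its own arguments.
ds_config = {
    "zero_optimization": {"stage": 3},
    "bf16": {"enabled": "auto"},
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
}

with open("deepspeed.json", "w") as f:
    json.dump(ds_config, f, indent=4)
```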

create_data.py

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
+import typer
+import json
+from transformers import Conversation
+from typing_extensions import Annotated
+import httpx
+import tqdm.asyncio
+import asyncio
+
+app = typer.Typer()
+
+
+client = httpx.AsyncClient(timeout=None)
+
+
+async def run(conv: Conversation, url: str):
+    # Ask the inference server for the next assistant message given the conversation so far.
+    payload = {"model": "tgi", "messages": conv.messages}
+    response = await client.post(url, json=payload)
+    content = response.json()
+    message = content["choices"][0]["message"]
+    message.pop("name", None)
+    conv.add_message(message)
+
+
+def fix_source(source):
+    if source and source[0]["from"] == "gpt":
+        # Skip if GPT is first to talk
+        source = source[1:]
+    new_source = []
+    for item in source:
+        role = "assistant" if item["from"] == "gpt" else "user"
+        content = item["value"]
+        new_source.append({"role": role, "content": content})
+    return new_source
+
+
+async def recreate_conversation(conversation, sem, url):
+    async with sem:
+        conv = Conversation()
+        try:
+            # Keep the user turns and regenerate every assistant turn with the target model.
+            for message in conversation[::2]:
+                assert message["role"] == "user"
+                conv.add_message(message)
+                await run(conv, url)
+        except Exception:
+            # If the server rejects a conversation (e.g. it is too long), keep what was recreated so far.
+            pass
+        return conv.messages
+
+
+@app.command()
+def main(
+    *,
+    input_filename: Annotated[str, typer.Option("--input-filename")],
+    output_filename: Annotated[str, typer.Option("--output-filename")],
+    url: Annotated[str, typer.Option("--url")] = "http://localhost:8080/v1/chat/completions",
+    concurrency: Annotated[int, typer.Option("--concurrency")] = 64,
+):
+    sem = asyncio.Semaphore(concurrency)
+
+    async def _main():
+        with open(input_filename, "r") as f:
+            input_data = json.loads(f.read())
+        conversations = [fix_source(source["conversations"]) for source in input_data]
+
+        futures = []
+        for conversation in conversations:
+            future = recreate_conversation(conversation, sem, url)
+            futures.append(future)
+
+        recreated_conversations = await tqdm.asyncio.tqdm.gather(*futures)
+
+        with open(output_filename, "w") as f:
+            json.dump(recreated_conversations, f, indent=4)
+
+    asyncio.run(_main())
+
+
+if __name__ == "__main__":
+    app()
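
Before converting the full ShareGPT dump, it can be worth checking that the server started in the README section above answers on the route this script posts to. A minimal sketch, assuming the default `--url` above (`http://localhost:8080/v1/chat/completions`) and a text-generation-inference version that exposes the OpenAI-compatible Messages API:

```python
import httpx

# Same endpoint and payload shape as create_data.py's run(); "tgi" is a placeholder
# model name, since the server only serves the single model it was launched with.
url = "http://localhost:8080/v1/chat/completions"
payload = {
    "model": "tgi",
    "messages": [{"role": "user", "content": "Say hello in one short sentence."}],
}

response = httpx.post(url, json=payload, timeout=60)
response.raise_for_status()
print(response.json()["choices"][0]["message"]["content"])
```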

medusa/model/medusa_model_legacy.py

Lines changed: 9 additions & 6 deletions
@@ -90,8 +90,8 @@ def __init__(
         super().__init__()
         self.base_model = base_model
         self.config = base_model.config
-        self.hidden_size = base_model.lm_head.weight.shape[-1]
-        self.vocab_size = base_model.lm_head.weight.shape[0]
+        self.hidden_size = base_model.config.hidden_size
+        self.vocab_size = base_model.config.vocab_size
         self.medusa = medusa_num_heads
         self.medusa_num_layers = medusa_num_layers
         self.base_model_name_or_path = base_model_name_or_path
@@ -110,9 +110,12 @@ def __init__(
         # Ensure medusa_head's dtype and device align with the base_model
         self.medusa_head.to(self.base_model.dtype).to(self.base_model.device)
 
-        for i in range(medusa_num_heads):
-            # Initialize the weights of each medusa_head using the base model's weights
-            self.medusa_head[i][-1].weight.data[:] = base_model.lm_head.weight.data[:]
+        import deepspeed
+        params = [base_model.lm_head.weight]
+        with deepspeed.zero.GatheredParameters(params):
+            for i in range(medusa_num_heads):
+                # Initialize the weights of each medusa_head using the base model's weights
+                self.medusa_head[i][-1].weight.data[:] = base_model.lm_head.weight.data[:]
 
     def get_tokenizer(self):
         """Get the tokenizer of the base model.
"""Get the tokenizer of the base model.
@@ -189,7 +192,7 @@ def forward(
             torch.Tensor: A tensor containing predictions from all Medusa heads.
             (Optional) Original predictions from the base model's LM head.
         """
-        with torch.inference_mode():
+        with torch.no_grad():
             # Pass input through the base model
             outputs = self.base_model.model(
                 input_ids=input_ids,
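
The head-initialization change above wraps the `lm_head` copy in `deepspeed.zero.GatheredParameters`: under ZeRO stage 3 the base model's weights may only exist as shards on each rank, so the full tensor has to be materialized before it can be copied into the new heads. A standalone sketch of the same pattern on toy layers (hypothetical sizes, no distributed setup, assuming `deepspeed` is installed; outside ZeRO-3 the context manager is effectively a no-op, so the copy behaves like a plain tensor copy):

```python
import torch
import deepspeed

# Toy stand-ins for the base model's lm_head and one Medusa head's output layer.
lm_head = torch.nn.Linear(4096, 32000, bias=False)
medusa_out = torch.nn.Linear(4096, 32000, bias=False)

# Under ZeRO-3, lm_head.weight may only hold a shard on this rank; gathering it
# gives every rank the full tensor for the duration of the block so the copy is valid.
with deepspeed.zero.GatheredParameters([lm_head.weight]):
    medusa_out.weight.data[:] = lm_head.weight.data[:]
```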

medusa/train/train_legacy.py

Lines changed: 44 additions & 78 deletions
@@ -29,6 +29,7 @@
 import transformers
 from transformers import Trainer, BitsAndBytesConfig
 from transformers.trainer_pt_utils import LabelSmoother
+from safetensors.torch import save_file
 
 from fastchat.conversation import SeparatorStyle
 from fastchat.model.model_adapter import get_conversation_template
@@ -80,7 +81,7 @@ def compute_loss(self, model, inputs, return_outputs=False):
             medusa_labels = medusa_labels[not_ignore]
 
             # Add top-k accuracy
-            for k in range(1, 6):
+            for k in range(1, 2):
                 _, topk = medusa_logits.topk(k, dim=-1)
                 topk = topk[not_ignore]
                 correct = topk.eq(medusa_labels.unsqueeze(-1)).any(-1)
@@ -119,6 +120,7 @@ class DataArguments:
 @dataclass
 class TrainingArguments(transformers.TrainingArguments):
     cache_dir: Optional[str] = field(default=None)
+    report_to: Optional[str] = None
     optim: str = field(default="adamw_torch")
     model_max_length: int = field(
         default=2048,
@@ -158,7 +160,6 @@ def safe_save_model_for_hf_trainer(trainer: transformers.Trainer, output_dir: st
     del state_dict
     trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa
 
-
 def preprocess(
     sources,
     tokenizer: transformers.PreTrainedTokenizer,
@@ -173,73 +174,43 @@ def preprocess(
     Returns:
         Dict: A dictionary containing tokenized inputs, labels, and attention mask.
     """
-    conv = get_conversation_template("vicuna")
-    roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
 
     # Apply prompt templates
     conversations = []
-    for i, source in enumerate(sources):
-        if roles[source[0]["from"]] != conv.roles[0]:
-            # Skip the first one if it is not from human
-            source = source[1:]
-
-        conv.messages = []
-        for j, sentence in enumerate(source):
-            role = roles[sentence["from"]]
-            assert role == conv.roles[j % 2], f"{i}, {j}, {role}, {conv.roles[j % 2]}"
-            conv.append_message(role, sentence["value"])
-        conversations.append(conv.get_prompt())
+    prompts = []
+    for i, conversation in enumerate(sources):
+        prompt = tokenizer.apply_chat_template(conversation, tokenize=False)
+        prompts.append(prompt)
+        conversations.append(conversation)
 
     # Tokenize conversations
-    input_ids = tokenizer(
-        conversations,
+    encoding = tokenizer(
+        prompts,
         return_tensors="pt",
         padding="max_length",
-        max_length=tokenizer.model_max_length,
         truncation=True,
-    ).input_ids
-    targets = input_ids.clone()
-
-    assert conv.sep_style == SeparatorStyle.ADD_COLON_TWO
+        return_offsets_mapping=True,
+    )
+    # Set everything to be ignored, except the assistant part
+    targets = torch.full_like(encoding.input_ids, IGNORE_TOKEN_ID)
+    input_ids = encoding.input_ids
 
     # Mask targets. Only compute loss on the assistant outputs.
-    sep = conv.sep + conv.roles[1] + ": "
-    for conversation, target in zip(conversations, targets):
-        total_len = int(target.ne(tokenizer.pad_token_id).sum())
-
-        turns = conversation.split(conv.sep2)
-        cur_len = 1
-        target[:cur_len] = IGNORE_TOKEN_ID
-        for i, turn in enumerate(turns):
-            if turn == "":
-                break
-            turn_len = len(tokenizer(turn).input_ids)
-
-            parts = turn.split(sep)
-            if len(parts) != 2:
-                break
-            parts[0] += sep
-            # "-2" is hardcoded for the LLaMA tokenizer to make the offset correct.
-            instruction_len = len(tokenizer(parts[0]).input_ids) - 2
-
-            # Ignore the user instructions
-            target[cur_len : cur_len + instruction_len] = IGNORE_TOKEN_ID
-            cur_len += turn_len
-
-        target[cur_len:] = IGNORE_TOKEN_ID
-
-        if False:  # Inspect and check the correctness of masking
-            z = target.clone()
-            z = torch.where(z == IGNORE_TOKEN_ID, tokenizer.unk_token_id, z)
-            rank0_print(tokenizer.decode(z))
-
-        if cur_len < tokenizer.model_max_length:
-            if cur_len != total_len:
-                target[:] = IGNORE_TOKEN_ID
-                rank0_print(
-                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
-                    f" (ignored)"
-                )
+    for conv_index, (conversation, target, prompt) in enumerate(zip(conversations, targets, prompts)):
+        for turn in conversation:
+            if turn["role"] == "assistant":
+                content = turn["content"]
+                # Unfortunate strip() necessary because chat templates are doing the same.
+                start = prompt.index(content.strip())
+                stop = start + len(content.strip())
+                indices = []
+                for tok_index, (tok_start, tok_stop) in enumerate(encoding.offset_mapping[conv_index]):
+                    # Keep tokens whose character span overlaps the assistant reply.
+                    if tok_start < stop and tok_stop > start:
+                        indices.append(tok_index)
+                target[indices] = encoding.input_ids[conv_index][indices]
 
     return dict(
         input_ids=input_ids,
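
The rewritten `preprocess` above no longer hand-parses the Vicuna separator format; it renders the chat template, tokenizes with `return_offsets_mapping=True`, and then un-ignores exactly the tokens whose character spans overlap an assistant reply. A self-contained sketch of that masking step on hypothetical offsets and token ids (no tokenizer download needed):

```python
import torch

IGNORE_TOKEN_ID = -100  # LabelSmoother.ignore_index, as in train_legacy.py

# Hypothetical offsets for 8 tokens of a rendered prompt; the final (0, 0) entry
# mimics a padding token. [start, stop) is the character span of one assistant reply.
offset_mapping = [(0, 4), (4, 9), (9, 14), (14, 20), (20, 26), (26, 31), (31, 37), (0, 0)]
input_ids = torch.tensor([101, 17, 923, 4, 88, 250, 7, 0])
start, stop = 14, 31

labels = torch.full_like(input_ids, IGNORE_TOKEN_ID)
indices = [
    i
    for i, (tok_start, tok_stop) in enumerate(offset_mapping)
    if tok_start < stop and tok_stop > start  # token overlaps the assistant span
]
labels[indices] = input_ids[indices]
print(labels)  # tensor([-100, -100, -100,    4,   88,  250, -100, -100])
```
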
@@ -260,7 +231,7 @@ def __init__(self, raw_data, tokenizer: transformers.PreTrainedTokenizer):
         super(SupervisedDataset, self).__init__()
 
         rank0_print("Formatting inputs...")
-        sources = [example["conversations"] for example in raw_data]
+        sources = raw_data
         data_dict = preprocess(sources, tokenizer)
 
         self.input_ids = data_dict["input_ids"]
@@ -304,7 +275,7 @@ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
         if i in self.cached_data_dict:
             return self.cached_data_dict[i]
 
-        ret = preprocess([self.raw_data[i]["conversations"]], self.tokenizer)
+        ret = preprocess([self.raw_data[i]], self.tokenizer)
         ret = dict(
             input_ids=ret["input_ids"][0],
             labels=ret["labels"][0],
@@ -364,23 +335,12 @@ def train():
         config.rope_scaling = {"type": "linear", "factor": scaling_factor}
     config.use_cache = False
 
-    quantization_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_compute_dtype=torch.bfloat16,
-        bnb_4bit_use_double_quant=True,
-        bnb_4bit_quant_type="nf4",
-    )
-
     # Load model and tokenizer
     model = transformers.AutoModelForCausalLM.from_pretrained(
         model_args.model_name_or_path,
         config=config,
         cache_dir=training_args.cache_dir,
-        low_cpu_mem_usage=True,
         torch_dtype=torch.bfloat16,
-        quantization_config=quantization_config if model_args.load_in_4bit else None,
-        load_in_4bit=model_args.load_in_4bit,
-        load_in_8bit=model_args.load_in_8bit,
     )
 
     # Freeze the base model
@@ -403,7 +363,7 @@ def train():
         cache_dir=training_args.cache_dir,
         model_max_length=training_args.model_max_length,
         padding_side="right",
-        use_fast=False,
+        use_fast=True,
     )
     tokenizer.pad_token = tokenizer.unk_token
 
@@ -420,7 +380,6 @@ def train():
     # Save Medusa config
     medusa_config.save_pretrained(training_args.output_dir)
 
-    # import pdb; pdb.set_trace()
     # Start trainer
     trainer = CustomizedTrainer(
         model=medusa_lm_head, tokenizer=tokenizer, args=training_args, **data_module
@@ -438,12 +397,19 @@ def train():
         lm_head = medusa_lm_head.module.medusa_head
     else:
         lm_head = medusa_lm_head.medusa_head
+    import deepspeed
+    with deepspeed.zero.GatheredParameters(lm_head.parameters()):
+        state_dict = lm_head.state_dict()
 
     # Save Medusa heads
-    torch.save(
-        lm_head.state_dict(),
-        os.path.join(training_args.output_dir, "medusa_lm_head.pt"),
-    )
+    if local_rank == 0:
+        # Modify the tokenizer internal state before saving.
+        tokenizer.encode("Test", truncation=None, padding="do_not_pad")
+        tokenizer.save_pretrained(training_args.output_dir)
+        save_file(
+            state_dict,
+            os.path.join(training_args.output_dir, "medusa_lm_head.safetensors"),
+        )
 
 
 if __name__ == "__main__":
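
With this change the Medusa heads are written with `safetensors.torch.save_file` (alongside the tokenizer) instead of `torch.save`, so downstream code that still expects `medusa_lm_head.pt` needs to switch loaders. A minimal sketch of reading the new artifact back, assuming the `test` output directory from the README command above:

```python
import os
from safetensors.torch import load_file

output_dir = "test"  # matches --output_dir in the README example
state_dict = load_file(os.path.join(output_dir, "medusa_lm_head.safetensors"))

# Inspect the saved head weights before loading them into a MedusaModel.
for name, tensor in state_dict.items():
    print(name, tuple(tensor.shape), tensor.dtype)
```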
