update medusa eval

leeyeehoo · leeyeehoo · commit 6294228e2667 · 2023-10-28T06:25:53.000Z
diff --git a/.gitignore b/.gitignore
@@ -172,4 +172,5 @@ notebooks/test*.ipynb
 notebooks/*.pdf
 llm_judge/*.sh
 llm_judge/data/mt_bench_test
-data
+data
+medusa/eval/*.sh
diff --git a/medusa/eval/README.md b/medusa/eval/README.md
@@ -0,0 +1,27 @@
+
+We use the [AlpacaEval](https://huggingface.co/datasets/tatsu-lab/alpaca_eval/blob/0cd24d711fe90d0c1aae5bde03fe98ee48ae52f8/alpaca_eval.json) dataset to evaluate each head's accuracy during generation in `heads_accuracy.py`.
+
+```
+python heads_accuracy.py --model_path 'FasterDecoding/medusa-vicuna-7b-v1.3' --model_name 'medusa-vicuna-7b-v1.3' --medusa_num_heads 5 --data_path '../../data/alpaca_eval.json'
+```
+
+
+To build the tree and plot it (requires the `pygraphviz` package), run:
+
+```
+python gen_results.py --accuracy-path '../../data/medusa-vicuna-7b-v1.3_heads_accuracy.pt' --output-path '../../data/graph.jpg'
+```
+
+To use the tree, add the generated tree (as a nested tuple) to `../model/medusa_choices.py`.
+
+Citation:
+
+```
+@misc{alpaca_eval,
+  author = {Xuechen Li and Tianyi Zhang and Yann Dubois and Rohan Taori and Ishaan Gulrajani and Carlos Guestrin and Percy Liang and Tatsunori B. Hashimoto },
+  title = {AlpacaEval: An Automatic Evaluator of Instruction-following Models},
+  year = {2023},
+  publisher = {GitHub},
+  journal = {GitHub repository},
+  howpublished = {\url{https://github.com/tatsu-lab/alpaca_eval}}
+}
+```
diff --git a/medusa/eval/gen_results.py b/medusa/eval/gen_results.py
@@ -0,0 +1,99 @@
+import matplotlib.pyplot as plt
+import copy
+import networkx as nx
+import torch
+import argparse
+
+def load_accuracy_table(path, num_positions=16100):
+    """Load per-head hit indicators and turn them into an accuracy table.
+
+    Args:
+        path: File readable by ``torch.load`` that holds an indexable
+            sequence with one entry per head. Each entry is summed over
+            dim 0 to obtain hit counts.
+        num_positions: Divisor applied to every head's hit counts. Defaults
+            to 16100, the value that used to be hard-coded here (presumably
+            prompts x decoding steps of the reference run -- TODO confirm).
+            Pass the real number of evaluated positions for other runs.
+
+    Returns:
+        Tensor built by stacking one accuracy row per head.
+    """
+    # NOTE(review): torch.load is pickle-based; only load files you trust.
+    test_accuracy = torch.load(path)
+    return torch.stack([
+        test_accuracy[i].sum(0) / num_positions
+        for i in range(len(test_accuracy))
+    ])
+
+def get_node_expectation(accuracies, node):
+    """Return the product of the accuracies selected along ``node``.
+
+    ``node`` is a sequence of indices; entry ``d`` selects
+    ``accuracies[d, node[d]]``. The first factor is deep-copied so the
+    in-place multiplications never write back into ``accuracies``.
+    """
+    product = copy.deepcopy(accuracies[0, node[0]])
+    for depth, choice in enumerate(node[1:], start=1):
+        product *= accuracies[depth, choice]
+    return product
+
+def explore_graph(accuracies, max_depth, max_child, num_iterations):
+    """Greedily grow a tree of index paths ranked by expected accuracy.
+
+    Starting from the single path ``(0,)``, every iteration collects the
+    neighbors of all accepted paths and accepts the not-yet-accepted one
+    with the highest expectation (see ``get_node_expectation``). A path has
+    up to two neighbors:
+      * its next sibling (last index + 1), while the last index is below
+        ``max_child[depth] - 1``;
+      * its first child (path + ``(0,)``), while the path is shorter than
+        ``max_depth``.
+
+    Args:
+        accuracies: 2-D table indexed as ``accuracies[depth, choice]``.
+        max_depth: Maximum path length.
+        max_child: Per-depth bound on the choice index; needs at least
+            ``max_depth`` entries.
+        num_iterations: Maximum number of paths to add after the root.
+
+    Returns:
+        List of accepted paths (tuples) in acceptance order, beginning with
+        ``(0,)``. It holds fewer than ``num_iterations + 1`` paths when the
+        search runs out of candidates with a positive expectation.
+    """
+    root = (0,)
+    accept_nodes = [root]
+    accepted = {root}  # set mirror of accept_nodes for O(1) membership tests
+    explored_nodes = {root: get_node_expectation(accuracies, root)}
+
+    for _ in range(num_iterations):
+        # Collect the sibling/child neighbors of every accepted path.
+        neighbors = []
+        for node in accept_nodes:
+            if node[-1] < max_child[len(node) - 1] - 1:
+                neighbors.append(node[:-1] + (node[-1] + 1,))
+            if len(node) < max_depth:
+                neighbors.append(node + (0,))
+
+        # Pick the best unaccepted neighbor; the strict '>' keeps the first
+        # candidate encountered on ties.
+        best_neighbor = None
+        best_neighbor_expectation = 0
+        for neighbor in neighbors:
+            if neighbor in accepted:
+                continue
+            if neighbor not in explored_nodes:
+                explored_nodes[neighbor] = get_node_expectation(accuracies, neighbor)
+            neighbor_expectation = explored_nodes[neighbor]
+            if neighbor_expectation > best_neighbor_expectation:
+                best_neighbor = neighbor
+                best_neighbor_expectation = neighbor_expectation
+
+        if best_neighbor is None:
+            # Nothing left to add (tree exhausted or only zero-expectation
+            # candidates remain); stop instead of failing on tuple(None).
+            break
+        accept_nodes.append(best_neighbor)
+        accepted.add(best_neighbor)
+
+    return accept_nodes
+
+def plot_and_save_graph(accept_nodes, output_path):
+    """Draw the accepted paths as a directed tree and save the figure.
+
+    Every prefix of every path becomes a node; the empty prefix is drawn as
+    the node ``'root'``. The layout comes from Graphviz ``dot`` through
+    ``nx.nx_agraph.graphviz_layout`` and the figure is written to
+    ``output_path`` with ``plt.savefig``.
+    """
+    plt.figure(figsize=(40, 20))
+
+    tree = nx.DiGraph()
+    for path in accept_nodes:
+        for depth in range(len(path)):
+            # The first element of a path hangs directly off the root node.
+            parent = tuple(path[:depth]) if depth else 'root'
+            tree.add_edge(parent, tuple(path[:depth + 1]))
+
+    layout = nx.nx_agraph.graphviz_layout(tree, prog='dot')
+    nx.draw(
+        tree,
+        layout,
+        with_labels=True,
+        node_size=500,
+        node_color="skyblue",
+        font_size=10,
+        width=2,
+        edge_color="gray",
+    )
+    plt.savefig(output_path)
+
+def main():
+    """Command-line entry point: build the tree, print it, then plot it.
+
+    Parses the CLI options, loads the accuracy table, runs the greedy
+    exploration and prints the accepted nodes. Plotting is best-effort: any
+    exception raised while saving the graph is reported on stdout instead
+    of aborting the script.
+    """
+    cli = argparse.ArgumentParser(description="Generate Results.")
+    cli.add_argument('--accuracy-path', type=str, required=True, help="Path to load accuracy tensor.")
+    cli.add_argument('--output-path', type=str, required=True, help="Path to save the generated graph.")
+    cli.add_argument('--max-depth', type=int, default=5, help="Maximum depth of the graph.")
+    cli.add_argument('--num-iterations', type=int, default=62, help="Number of exploration iterations.")
+    cli.add_argument('--max-child', nargs='+', type=int, default=[10, 10, 10, 10, 10], help="Maximum number of children per depth.")
+    opts = cli.parse_args()
+
+    table = load_accuracy_table(opts.accuracy_path)
+    accept_nodes = explore_graph(
+        table,
+        opts.max_depth,
+        opts.max_child,
+        opts.num_iterations,
+    )
+    print("Accepted Nodes:", accept_nodes)
+
+    try:
+        plot_and_save_graph(accept_nodes, opts.output_path)
+        print(f"Graph saved to {opts.output_path}.")
+    except Exception as e:
+        print(f"Failed to save the graph due to the following error: {e}")
+        print("Ensure that Graphviz and pygraphviz are installed and set up correctly.")
+
+# Run the command-line entry point only when executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/medusa/eval/heads_accuracy.py b/medusa/eval/heads_accuracy.py
@@ -0,0 +1,108 @@
+import os
+import torch
+import json
+from contextlib import contextmanager
+import numpy as np
+from medusa.model.medusa_model import MedusaModel
+from medusa.model.kv_cache import *
+from medusa.model.utils import *
+from medusa.model.medusa_choices import *
+from copy import deepcopy
+import matplotlib.pyplot as plt
+import torch.nn.functional as F
+from fastchat.model.model_adapter import get_conversation_template
+from tqdm import tqdm
+import argparse
+
+def get_accuracies(medusa, logit):
+    """Compare each head's candidates with the token chosen later on.
+
+    ``medusa`` must be 3-D; its second axis indexes the heads. For head
+    ``h`` the rows ``medusa[:-h - 1, h]`` are compared element-wise (with
+    broadcasting) against ``logit[h + 1:, 0]``, i.e. against the entries
+    ``h + 1`` steps further along the first axis.
+
+    Returns:
+        List with one boolean tensor per head.
+    """
+    # Unpacking also enforces that ``medusa`` has exactly three dimensions.
+    _, num_heads, _ = medusa.shape
+    return [
+        medusa[:-head - 1, head].eq(logit[head + 1:, 0])
+        for head in range(num_heads)
+    ]
+
+
+
+def main(args):
+    """Record, for every prompt, whether each Medusa head's top-20 hit.
+
+    Reads ``args.model_path``, ``args.medusa_num_heads``, ``args.data_path``,
+    ``args.steps``, ``args.save_dir`` and ``args.model_name``. Every sample
+    in the JSON data must provide an ``"instruction"`` entry. For each
+    sample the prompt is run once, then ``args.steps`` further single-token
+    greedy steps are taken; at every step the base model's argmax token and
+    the top-20 ids from ``medusa_logits`` are kept. The per-head comparison
+    results from ``get_accuracies`` are concatenated over all samples and
+    saved with ``torch.save`` to
+    ``<save_dir>/<model_name>_heads_accuracy.pt``.
+    """
+    model = MedusaModel.from_pretrained(
+        args.model_path,
+        medusa_num_heads=args.medusa_num_heads,
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
+        device_map="auto"
+    )
+    tokenizer = model.get_tokenizer()
+
+
+    # Use a context manager so the dataset file is closed right after
+    # parsing instead of leaking the handle.
+    with open(args.data_path, encoding="utf-8") as data_file:
+        data = json.load(data_file)
+    past_key_values, past_key_values_data, current_length_data = initialize_past_key_values(model.base_model, model.medusa_num_decoder_layers)
+    model.past_key_values = past_key_values
+    model.past_key_values_data = past_key_values_data
+    model.current_length_data = current_length_data
+    results = None
+
+    for sample in tqdm((data)):
+        # Build a single-turn prompt with an empty assistant reply.
+        conv = get_conversation_template("vicuna")
+        conv.messages = []
+        conv.append_message(conv.roles[0], sample["instruction"])
+        conv.append_message(conv.roles[1], "")
+        prompt = conv.get_prompt()
+        steps = args.steps
+        logits_ids = []
+        medusa_topk_ids = []
+
+        with torch.inference_mode():
+            input_ids = tokenizer([prompt]).input_ids
+            input_ids = torch.as_tensor(input_ids).cuda()
+            model.current_length_data.zero_() # this is for rerun
+            reset_medusa_mode(model)
+            # First pass over the whole prompt.
+            medusa_logits, outputs, logits = model(
+                input_ids, past_key_values=past_key_values, output_orig=True
+            )
+            # Top-20 ids at the last position of medusa_logits
+            # (presumably one row per head -- TODO confirm the layout).
+            _, medusa_topk = medusa_logits[...,-1,:].topk(20, dim=-1)
+            # Greedy next token taken from the returned ``logits``.
+            input_id = logits[:, -1:].argmax(dim=-1)
+            logits_ids.append(input_id.detach().cpu())
+            medusa_topk_ids.append(medusa_topk.detach().cpu())
+            # Continue one token at a time, feeding back the argmax token.
+            for _ in range(steps):
+                medusa_logits, outputs, logits = model(
+                    input_id, past_key_values=past_key_values, output_orig=True
+                )
+                _, medusa_topk = medusa_logits[...,-1,:].topk(20, dim=-1)
+                input_id = logits[:, -1:].argmax(dim=-1)
+                logits_ids.append(input_id.detach().cpu())
+                medusa_topk_ids.append(medusa_topk.detach().cpu())
+            logits_ids = torch.stack(logits_ids, dim=0)
+            medusa_topk_ids = torch.stack(medusa_topk_ids, dim=0).squeeze(2)
+            cur_results = get_accuracies(medusa_topk_ids, logits_ids)
+            if results is None:
+                results = cur_results
+            else:
+                # Append this sample's rows to each head's running tensor.
+                for i in range(len(results)):
+                    results[i] = torch.cat((results[i], cur_results[i]), dim=0)
+
+    save_path = os.path.join(args.save_dir, args.model_name + "_heads_accuracy.pt")
+    torch.save(results, save_path)
+
+if __name__ == "__main__":
+    # Script entry point: parse the CLI options, make sure the output
+    # directory exists, then run the evaluation.
+    parser = argparse.ArgumentParser(description="Medusa Model Evaluator")
+
+    parser.add_argument("--model_path", type=str, required=True,
+                        help="Path to the pre-trained Medusa model.")
+    parser.add_argument("--model_name", type=str, required=True,
+                        help="Name of the model.")
+    parser.add_argument("--medusa_num_heads", type=int, default=5,
+                        help="Number of medusa heads.")
+    parser.add_argument("--data_path", type=str, required=True,
+                        help="Path to the evaluation data in JSON format.")
+    parser.add_argument("--save_dir", type=str, default="../../data",
+                        help="Directory to save the results.")
+    parser.add_argument("--steps", type=int, default=20,
+                        help="Number of steps to run the model.")
+    args = parser.parse_args()
+
+    # exist_ok=True creates the directory when it is missing and is a no-op
+    # when it already exists, avoiding the check-then-create race of a
+    # separate os.path.exists() test.
+    os.makedirs(args.save_dir, exist_ok=True)
+    main(args)
diff --git a/medusa/model/modeling_llama_kv.py b/medusa/model/modeling_llama_kv.py
diff --git a/medusa/model/modeling_llama_kv_legacy.py b/medusa/model/modeling_llama_kv_legacy.py