
Commit ac75468

data generation for self-distillation
1 parent 0ac14da commit ac75468

File tree: 3 files changed, +253 -0 lines changed

data_generation/README.md

Lines changed: 27 additions & 0 deletions
# Generate chat data for self-distillation

We use vLLM to enable batched generation. First, install dependencies:

```bash
pip install vllm openai
```

## Start server

```bash
python -m vllm.entrypoints.openai.api_server \
    --model YOUR_MODEL_NAME --port 8000
```

You can also start multiple servers with different ports to enable parallel generation. In `generate.py`, we scan the ports from 8000 to 8009 to find available servers. You can modify the code to use other ports.
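For example, to run two servers in parallel on two GPUs (a minimal sketch; the `CUDA_VISIBLE_DEVICES` assignment is an assumption and depends on your hardware):

```bash
CUDA_VISIBLE_DEVICES=0 python -m vllm.entrypoints.openai.api_server \
    --model YOUR_MODEL_NAME --port 8000 &
CUDA_VISIBLE_DEVICES=1 python -m vllm.entrypoints.openai.api_server \
    --model YOUR_MODEL_NAME --port 8001 &
```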

## Generate data

The following command lets the model continue the first prompt from each sample in `DATA_PATH`. This is suitable for models that can play both roles in a conversation (e.g., Zephyr 7B). If you want to use all the prompts in each sample to repeatedly talk to the model, use `--chat` instead. `--chat` mode works for more models but may take longer to generate due to repeated computation (contributions of a better implementation are welcome).

```bash
python generate.py --data_path YOUR_DATA_PATH --output_path YOUR_OUTPUT_PATH --num_threads NUM_THREADS --max_tokens YOUR_MAX_TOKENS --temperature YOUR_TEMPERATURE
```
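
For reference, `generate.py` expects `DATA_PATH` to be a single JSON file containing a list of ShareGPT-style samples; a minimal illustrative example (the texts are made up) is shown below. Only the human turns (and an optional leading system message) are read from each sample; any existing gpt turns are ignored and regenerated by the served model.

```json
[
  {
    "conversations": [
      {"from": "human", "value": "What is self-distillation?"},
      {"from": "gpt", "value": "..."},
      {"from": "human", "value": "Can you give a concrete example?"},
      {"from": "gpt", "value": "..."}
    ]
  }
]
```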

## (Optional) Format data

When generated with `--chat`, the output file already follows the ShareGPT format ([example](https://github.com/lm-sys/FastChat/blob/main/data/dummy_conversation.json)).
You can use the following command to convert the text generated without `--chat` to the same format:

```bash
python convert_to_sharegpt.py --input_path YOUR_INPUT_PATH --model_name YOUR_MODEL_NAME --output_path YOUR_OUTPUT_PATH
```
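
Without `--chat`, each line of the raw output file is a single JSON object whose `text` field holds the full prompt plus the model's continuation, roughly like the illustrative line below (the exact role tags come from the model's conversation template; the Zephyr-style tags here are only an assumption):

```json
{"text": "<|user|>\nWhat is self-distillation?</s>\n<|assistant|>\nSelf-distillation trains a model on responses generated by itself..."}
```

`convert_to_sharegpt.py` splits this text back into alternating human/gpt messages using the same template's role tags and drops the final, possibly truncated turn.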
data_generation/convert_to_sharegpt.py

Lines changed: 70 additions & 0 deletions
import json
import os
import time
import concurrent.futures

import openai
import shortuuid
import tqdm

import argparse
import random

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

from fastchat.conversation import Conversation, SeparatorStyle
from fastchat.model.model_adapter import get_conversation_template
from transformers import AutoTokenizer

parser = argparse.ArgumentParser()
parser.add_argument("--input_path", type=str)
parser.add_argument("--model_name", type=str, default="HuggingFaceH4/zephyr-7b-beta")
# --output_path is optional; without it we keep the original behaviour of
# writing next to the input file (see the README command).
parser.add_argument("--output_path", type=str, default=None)
args = parser.parse_args()

# Conversation template of the generating model; its role tags and separator
# are used to split the raw text back into turns.
conv = get_conversation_template(args.model_name)
tokenizer = AutoTokenizer.from_pretrained(args.model_name)

# Each input line is a JSON object with a "text" field
# (the prompt plus generation produced by generate.py without --chat).
data = []
with open(args.input_path) as f:
    for line in f.readlines():
        data.append(json.loads(line))


def convert(text):
    # Split the raw generated text into ShareGPT-style human/gpt turns.
    messages = []

    for turn in text.split(conv.roles[0]):
        pairs = turn.split(conv.roles[1])
        if len(pairs) != 2:
            continue
        messages.append({
            "from": "human",
            "value": pairs[0].split(conv.sep)[0].strip()
        })
        messages.append({
            "from": "gpt",
            "value": pairs[1].split(conv.sep)[0].strip()
        })
    # pop the last message because it might be incomplete
    if len(messages) > 0:
        messages.pop()
    # make sure the number of messages is even
    if len(messages) % 2 == 1:
        messages.pop()
    return {"conversations": messages}


sharegpt_data = []
for d in tqdm.tqdm(data):
    sample = convert(d["text"])
    if len(sample["conversations"]) < 2:
        continue
    sharegpt_data.append(sample)

# dump to jsonl (default: alongside the input file with a "_sharegpt" suffix)
output_path = args.output_path or args.input_path.replace(".jsonl", "_sharegpt.jsonl")
with open(output_path, "w") as f:
    for d in sharegpt_data:
        f.write(json.dumps(d) + "\n")

data_generation/generate.py

Lines changed: 156 additions & 0 deletions
import json
import os
import time
import concurrent.futures

import openai
import shortuuid
import tqdm

import argparse
import random

from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

from fastchat.conversation import Conversation, SeparatorStyle
from fastchat.model.model_adapter import get_conversation_template

# Modify OpenAI's API key and API base to use vLLM's API server.
openai.api_key = "EMPTY"
openai.api_base = "http://localhost:8000/v1"

api_base_pool = []

# Probe ports 8000-8009 and register every vLLM server that responds to the
# "list models" API.
for i in range(10):
    openai.api_base = "http://localhost:800{}/v1".format(i)
    try:
        models = openai.Model.list()["data"][0]["id"]
        print(openai.api_base, models)
        api_base_pool.append(openai.api_base)
    except Exception:
        break

print("API base pool: ", api_base_pool)

parser = argparse.ArgumentParser()
parser.add_argument("--data_path", type=str)
parser.add_argument("--output_path", type=str)
parser.add_argument("--num_threads", type=int, default=256)
parser.add_argument("--temperature", type=float, default=0.3)
parser.add_argument("--max_tokens", type=int, default=2048)
parser.add_argument("--chat", action="store_true")
args = parser.parse_args()

# Assuming the ShareGPT format
data = json.load(open(args.data_path, "r"))


def generate_data(messages, idx):
    try:
        # Round-robin over the available servers (openai.api_base is
        # module-level, so this is best-effort load balancing).
        openai.api_base = api_base_pool[idx % len(api_base_pool)]
        model_name = openai.Model.list()["data"][0]["id"]

        if args.chat:
            converted_messages = []
            output_messages = []
            if messages[0]["from"] == "system":
                converted_messages.append(
                    {
                        "role": "system",
                        "content": messages[0]["text"],
                    }
                )
                output_messages.append(messages[0])
                messages = messages[1:]
            # Send every human turn in order and regenerate the assistant turns.
            for message in messages[::2]:
                if message["from"] != "human":
                    return
                converted_messages.append(
                    {
                        "role": "user",
                        "content": message["value"],
                    }
                )
                try:
                    response = openai.ChatCompletion.create(
                        model=model_name,
                        messages=converted_messages,
                        max_tokens=args.max_tokens,
                        temperature=args.temperature,
                    )
                    if response.choices[0]["finish_reason"] == "length":
                        break
                    response = response.choices[0]["message"]["content"].strip()
                    output_messages.append(message)
                    output_messages.append(
                        {
                            "from": "gpt",
                            "value": response,
                        }
                    )
                    converted_messages.append(
                        {
                            "role": "assistant",
                            "content": response,
                        }
                    )
                except Exception:
                    break
            if len(output_messages) == 0:
                return
            with open(args.output_path, "a") as f:
                # write in ShareGPT format
                f.write(json.dumps({"conversations": output_messages}) + "\n")
        else:
            # Only the first prompt is used; the model continues the conversation.
            conv = get_conversation_template(model_name)
            if messages[0]["from"] == "system":
                conv.system_message = messages[0]["text"]
                messages = messages[1:]
            conv.append_message(conv.roles[0], messages[0]["value"])
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()

            response = openai.Completion.create(
                model=model_name,
                prompt=prompt,
                max_tokens=args.max_tokens,
                temperature=args.temperature,
                ignore_eos=True,
                skip_special_tokens=False,
                spaces_between_special_tokens=False,
            )
            response = response.choices[0]["text"].strip()
            with open(args.output_path, "a") as f:
                # write the raw prompt + continuation; convert_to_sharegpt.py
                # can turn it into ShareGPT format later
                f.write(json.dumps({"text": prompt + response}) + "\n")
    except Exception as e:
        print(e)
        print("Failed to generate data")


# if output_path exists, count the number of lines and skip the first n data
start = 0
if os.path.exists(args.output_path):
    with open(args.output_path, "r") as f:
        start = len(f.readlines())
    print("Skip first {} data".format(start))

with concurrent.futures.ThreadPoolExecutor(max_workers=args.num_threads) as executor:
    futures = []
    for idx, sample in enumerate(data[start:]):
        future = executor.submit(
            generate_data,
            sample["conversations"],
            idx,
        )
        futures.append(future)

    for future in tqdm.tqdm(
        concurrent.futures.as_completed(futures), total=len(futures)
    ):
        future.result()

0 commit comments