
Commit 905e58a

Dolly V2 Updates (#88)
This updates training to use the [`databricks-dolly-15k`](https://github.com/databrickslabs/dolly/tree/master/data) dataset. It also includes improvements to text generation and example notebooks.

Key Changes:

* The `train_dolly.py` notebook now uses Pythia models as the input models and fine-tunes using the [`databricks-dolly-15k`](https://github.com/databrickslabs/dolly/tree/master/data) dataset.
* Added `InstructionTextGenerationPipeline` for text generation. This is derived from the code in the model repo, [instruct_pipeline.py](https://huggingface.co/databricks/dolly-v2-12b/blob/main/instruct_pipeline.py). It has been improved so that it is compatible with the `TextGenerationPipeline` from the `transformers` library. Some code, such as that in `_forward`, was copied from that pipeline to help with compatibility. The biggest change relative to the current `instruct_pipeline.py` version is that it returns a list of dicts per instruction, rather than just a dict. It also now has a `return_full_text` option. Both of these changes make it usable with `langchain`.
* `generate_response` is now a wrapper around `InstructionTextGenerationPipeline`, as the code was all moved there.
* `trainer.py` now uses the local `databricks-dolly-15k.jsonl` dataset. A `text` column is constructed from the instruction, context, and response.

Minor Changes:

* Added an `experiment_id` widget to help keep track of different models that are fine-tuned.
* Added more options to the CLI for configuring training.

Additional Changes:

* Added a `generation.py` example notebook that uses `generate_response` on a couple of instructions.
* Added a `langchain.py` example notebook that uses `HuggingFacePipeline` from `langchain` and `InstructionTextGenerationPipeline` to test instructions both with and without context.
* Added a `pipeline.py` example notebook that uses `InstructionTextGenerationPipeline` to generate multiple samples per instruction.
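A minimal sketch of the new return contract, using only names that appear in this commit (the model choice and the surrounding setup are illustrative, not prescribed by the diff):

```python
from training.generate import (
    InstructionTextGenerationPipeline,
    load_model_tokenizer_for_generate,
)

model, tokenizer = load_model_tokenizer_for_generate("databricks/dolly-v2-3b")
pipe = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)

# The pipeline now returns a list of dicts per instruction (one dict per
# generated sequence), matching transformers' TextGenerationPipeline.
results = pipe("Explain to me the difference between nuclear fission and fusion.")
print(results[0]["generated_text"])
```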
1 parent 3ea242c commit 905e58a

File tree

8 files changed: +560 −112 lines changed

examples/generation.py

Lines changed: 52 additions & 0 deletions
@@ -0,0 +1,52 @@
# Databricks notebook source
# MAGIC %md
# MAGIC ## Generation Example
# MAGIC
# MAGIC This takes a pretrained Dolly model, either from Hugging Face or from a local path, and runs generation with it
# MAGIC using the code from this repo.
# MAGIC
# MAGIC The model to load for generation is controlled by `input_model`. The default options are the pretrained
# MAGIC Dolly models shared on Hugging Face. Alternatively, the path to a local model that has been trained using the
# MAGIC `train_dolly` notebook can also be used.

# COMMAND ----------

# MAGIC %pip install -r ../requirements.txt

# COMMAND ----------

# MAGIC %load_ext autoreload
# MAGIC %autoreload 2

default_model = "databricks/dolly-v2-3b"

suggested_models = [
    "databricks/dolly-v1-6b",
    "databricks/dolly-v2-3b",
    "databricks/dolly-v2-7b",
    "databricks/dolly-v2-12b",
]

dbutils.widgets.combobox("input_model", default_model, suggested_models, "input_model")

# COMMAND ----------

from training.generate import generate_response, load_model_tokenizer_for_generate

input_model = dbutils.widgets.get("input_model")

model, tokenizer = load_model_tokenizer_for_generate(input_model)

# COMMAND ----------

# Examples from https://www.databricks.com/blog/2023/03/24/hello-dolly-democratizing-magic-chatgpt-open-models.html
instructions = [
    "Explain to me the difference between nuclear fission and fusion.",
    "Give me a list of 5 science fiction books I should read next.",
]

# Use the model to generate responses for each of the instructions above.
for instruction in instructions:
    response = generate_response(instruction, model=model, tokenizer=tokenizer)
    if response:
        print(f"Instruction: {instruction}\n\n{response}\n\n-----------\n")

examples/langchain.py

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
# Databricks notebook source
# MAGIC %md
# MAGIC ## Langchain Example
# MAGIC
# MAGIC This takes a pretrained Dolly model, either from Hugging Face or from a local path, and uses langchain
# MAGIC to run generation.
# MAGIC
# MAGIC The model to load for generation is controlled by `input_model`. The default options are the pretrained
# MAGIC Dolly models shared on Hugging Face. Alternatively, the path to a local model that has been trained using the
# MAGIC `train_dolly` notebook can also be used.

# COMMAND ----------

# MAGIC %pip install -r ../requirements.txt

# COMMAND ----------

# MAGIC %load_ext autoreload
# MAGIC %autoreload 2

# COMMAND ----------

default_model = "databricks/dolly-v2-3b"

suggested_models = [
    "databricks/dolly-v1-6b",
    "databricks/dolly-v2-3b",
    "databricks/dolly-v2-7b",
    "databricks/dolly-v2-12b",
]

dbutils.widgets.combobox("input_model", default_model, suggested_models, "input_model")

# COMMAND ----------

from training.generate import InstructionTextGenerationPipeline, load_model_tokenizer_for_generate

input_model = dbutils.widgets.get("input_model")

model, tokenizer = load_model_tokenizer_for_generate(input_model)

# COMMAND ----------

from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline

# template for an instruction with no input
prompt = PromptTemplate(
    input_variables=["instruction"],
    template="{instruction}")

# template for an instruction with input
prompt_with_context = PromptTemplate(
    input_variables=["instruction", "context"],
    template="{instruction}\n\nInput:\n{context}")

hf_pipeline = HuggingFacePipeline(
    pipeline=InstructionTextGenerationPipeline(
        # Return the full text, because this is what the HuggingFacePipeline expects.
        model=model, tokenizer=tokenizer, return_full_text=True, task="text-generation"))

llm_chain = LLMChain(llm=hf_pipeline, prompt=prompt)
llm_context_chain = LLMChain(llm=hf_pipeline, prompt=prompt_with_context)

# COMMAND ----------

# Examples from https://www.databricks.com/blog/2023/03/24/hello-dolly-democratizing-magic-chatgpt-open-models.html
instructions = [
    "Explain to me the difference between nuclear fission and fusion.",
    "Give me a list of 5 science fiction books I should read next.",
]

# Use the model to generate responses for each of the instructions above.
for instruction in instructions:
    response = llm_chain.predict(instruction=instruction)
    print(f"Instruction: {instruction}\n\n{response}\n\n-----------\n")

# COMMAND ----------

context = (
    """George Washington (February 22, 1732[b] – December 14, 1799) was an American military officer, statesman, """
    """and Founding Father who served as the first president of the United States from 1789 to 1797. Appointed by """
    """the Continental Congress as commander of the Continental Army, Washington led Patriot forces to victory in """
    """the American Revolutionary War and served as president of the Constitutional Convention of 1787, which """
    """created and ratified the Constitution of the United States and the American federal government. Washington """
    """has been called the "Father of his Country" for his manifold leadership in the nation's founding."""
)

instruction = "When did George Washington serve as president of the Constitutional Convention?"

response = llm_context_chain.predict(instruction=instruction, context=context)
print(f"Instruction: {instruction}\n\nContext:\n{context}\n\nResponse:\n{response}\n\n-----------\n")

examples/pipeline.py

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
# Databricks notebook source
# MAGIC %md
# MAGIC ## Pipeline Example
# MAGIC
# MAGIC This takes a pretrained Dolly model, either from Hugging Face or from a local path, and uses the pipeline from
# MAGIC this repo to perform generation.
# MAGIC
# MAGIC The model to load for generation is controlled by `input_model`. The default options are the pretrained
# MAGIC Dolly models shared on Hugging Face. Alternatively, the path to a local model that has been trained using the
# MAGIC `train_dolly` notebook can also be used.

# COMMAND ----------

# MAGIC %pip install -r ../requirements.txt

# COMMAND ----------

# MAGIC %load_ext autoreload
# MAGIC %autoreload 2

# COMMAND ----------

default_model = "databricks/dolly-v2-3b"

suggested_models = [
    "databricks/dolly-v1-6b",
    "databricks/dolly-v2-3b",
    "databricks/dolly-v2-7b",
    "databricks/dolly-v2-12b",
]

dbutils.widgets.combobox("input_model", default_model, suggested_models, "input_model")

# COMMAND ----------

from training.generate import InstructionTextGenerationPipeline, load_model_tokenizer_for_generate

input_model = dbutils.widgets.get("input_model")

model, tokenizer = load_model_tokenizer_for_generate(input_model)

# COMMAND ----------

generation_pipeline = InstructionTextGenerationPipeline(model=model, tokenizer=tokenizer)

# Examples from https://www.databricks.com/blog/2023/03/24/hello-dolly-democratizing-magic-chatgpt-open-models.html
instructions = [
    "Explain to me the difference between nuclear fission and fusion.",
    "Give me a list of 5 science fiction books I should read next.",
]

# Use the model to generate responses for each of the instructions above.
for instruction in instructions:
    results = generation_pipeline(instruction, num_return_sequences=2)

    print(f"Instruction: {instruction}\n")
    for i, res in enumerate(results, 1):
        text = res["generated_text"]
        print(f"Sample #{i}:\n{text}\n")
    print("-----------\n")

requirements.txt

Lines changed: 1 addition & 1 deletion
@@ -3,4 +3,4 @@ click==8.0.3
 datasets==2.8.0
 deepspeed==0.8.0
 transformers[torch]==4.25.1
-watchdog==2.1.9
+langchain>=0.0.139

train_dolly.py

Lines changed: 32 additions & 8 deletions
@@ -2,8 +2,11 @@
 # MAGIC %md
 # MAGIC ## Train Dolly
 # MAGIC
-# MAGIC This fine-tunes the [GPT-J 6B](https://huggingface.co/EleutherAI/gpt-j-6B) model on
-# MAGIC the [Alpaca](https://huggingface.co/datasets/tatsu-lab/alpaca) dataset.
+# MAGIC This fine-tunes EleutherAI Pythia models
+# MAGIC (e.g. [pythia-2.8b](https://huggingface.co/EleutherAI/pythia-2.8b),
+# MAGIC [pythia-6.9b](https://huggingface.co/EleutherAI/pythia-6.9b), or
+# MAGIC [pythia-12b](https://huggingface.co/EleutherAI/pythia-12b)) on
+# MAGIC the [databricks-dolly-15k](https://github.com/databrickslabs/dolly/tree/master/data) dataset.
 # MAGIC
 # MAGIC ```
 # MAGIC Licensed under the Apache License, Version 2.0 (the "License");
@@ -19,8 +22,10 @@
 # MAGIC limitations under the License.
 # MAGIC ```
 # MAGIC
-# MAGIC Please note that while GPT-J 6B is [Apache 2.0 licensed](https://huggingface.co/EleutherAI/gpt-j-6B),
-# MAGIC the Alpaca dataset is licensed under [Creative Commons NonCommercial (CC BY-NC 4.0)](https://huggingface.co/datasets/tatsu-lab/alpaca).
+# MAGIC The EleutherAI Pythia models are [Apache 2.0 licensed](https://huggingface.co/EleutherAI/pythia-12b) and
+# MAGIC the [databricks-dolly-15k](https://github.com/databrickslabs/dolly/tree/master/data) dataset is licensed under the terms
+# MAGIC of the [Creative Commons Attribution-ShareAlike 3.0 Unported License](https://creativecommons.org/licenses/by-sa/3.0/legalcode),
+# MAGIC which means it can be used for either academic or commercial purposes.

 # COMMAND ----------

@@ -55,12 +60,16 @@
 # COMMAND ----------

 import os
+import re
 from datetime import datetime
+from training.consts import DEFAULT_INPUT_MODEL, SUGGESTED_INPUT_MODELS
 from training.trainer import load_training_dataset, load_tokenizer

+dbutils.widgets.combobox("input_model", DEFAULT_INPUT_MODEL, SUGGESTED_INPUT_MODELS, "input_model")
 dbutils.widgets.text("num_gpus", "", "num_gpus")
 dbutils.widgets.text("local_training_root", "", "local_training_root")
 dbutils.widgets.text("dbfs_output_root", "", "dbfs_output_root")
+dbutils.widgets.text("experiment_id", "", "experiment_id")

 # COMMAND ----------

@@ -72,6 +81,14 @@

 timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
 model_name = "dolly"
+
+experiment_id = dbutils.widgets.get("experiment_id")
+input_model = dbutils.widgets.get("input_model")
+
+if experiment_id:
+    experiment_id = re.sub(r"\s+", "_", experiment_id.strip())
+    model_name = f"{model_name}__{experiment_id}"
+
 checkpoint_dir_name = f"{model_name}__{timestamp}"

 root_path = os.getcwd()
@@ -122,13 +139,20 @@

 # MAGIC !deepspeed {num_gpus_flag} \
 # MAGIC     --module training.trainer \
+# MAGIC     --input-model {input_model} \
 # MAGIC     --deepspeed {deepspeed_config} \
-# MAGIC     --epochs 1 \
+# MAGIC     --epochs 2 \
 # MAGIC     --local-output-dir {local_output_dir} \
 # MAGIC     --dbfs-output-dir {dbfs_output_dir} \
-# MAGIC     --per-device-train-batch-size 8 \
-# MAGIC     --per-device-eval-batch-size 8 \
-# MAGIC     --lr 1e-5
+# MAGIC     --per-device-train-batch-size 6 \
+# MAGIC     --per-device-eval-batch-size 6 \
+# MAGIC     --logging-steps 10 \
+# MAGIC     --save-steps 200 \
+# MAGIC     --save-total-limit 20 \
+# MAGIC     --eval-steps 50 \
+# MAGIC     --warmup-steps 50 \
+# MAGIC     --test-size 200 \
+# MAGIC     --lr 5e-6

 # COMMAND ----------
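The `experiment_id` handling added above folds a sanitized widget value into the checkpoint directory name. A worked example of the naming scheme, using the same logic as the diff (the widget value is illustrative):

```python
import re
from datetime import datetime

model_name = "dolly"
experiment_id = "  lr sweep 5e-6 "  # example widget input

if experiment_id:
    # Trim and replace runs of whitespace, exactly as train_dolly.py does.
    experiment_id = re.sub(r"\s+", "_", experiment_id.strip())
    model_name = f"{model_name}__{experiment_id}"  # "dolly__lr_sweep_5e-6"

timestamp = datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
checkpoint_dir_name = f"{model_name}__{timestamp}"
print(checkpoint_dir_name)  # e.g. "dolly__lr_sweep_5e-6__2023-04-14T09:15:00"
```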
training/consts.py

Lines changed: 67 additions & 11 deletions
@@ -1,18 +1,74 @@
-DEFAULT_TRAINING_DATASET = "tatsu-lab/alpaca"
-DEFAULT_INPUT_MODEL = "EleutherAI/gpt-j-6B"
-END_KEY = "### End"
+DEFAULT_INPUT_MODEL = "EleutherAI/pythia-6.9b"
+SUGGESTED_INPUT_MODELS = [
+    "EleutherAI/pythia-2.8b",
+    "EleutherAI/pythia-6.9b",
+    "EleutherAI/pythia-12b",
+    "EleutherAI/gpt-j-6B",
+]
+INTRO_BLURB = (
+    "Below is an instruction that describes a task. Write a response that appropriately completes the request."
+)
 INSTRUCTION_KEY = "### Instruction:"
-RESPONSE_KEY_NL = f"### Response:\n"
+INPUT_KEY = "Input:"
+RESPONSE_KEY = "### Response:"
+END_KEY = "### End"
+RESPONSE_KEY_NL = f"{RESPONSE_KEY}\n"
 DEFAULT_SEED = 42

-# The format of the instruction the model has been trained on.
-PROMPT_FORMAT = """%s
+# This is a training prompt that does not contain an input string. The instruction by itself has enough information
+# to respond. For example, the instruction might ask for the year a historic figure was born.
+PROMPT_NO_INPUT_FORMAT = """{intro}

-%s
+{instruction_key}
 {instruction}

-%s""" % (
-    "Below is an instruction that describes a task. Write a response that appropriately completes the request.",
-    INSTRUCTION_KEY,
-    RESPONSE_KEY_NL,
+{response_key}
+{response}
+
+{end_key}""".format(
+    intro=INTRO_BLURB,
+    instruction_key=INSTRUCTION_KEY,
+    instruction="{instruction}",
+    response_key=RESPONSE_KEY,
+    response="{response}",
+    end_key=END_KEY,
 )
+
+# This is a training prompt that contains an input string that serves as context for the instruction. For example,
+# the input might be a passage from Wikipedia and the instruction is to extract some information from it.
+PROMPT_WITH_INPUT_FORMAT = """{intro}
+
+{instruction_key}
+{instruction}
+
+{input_key}
+{input}
+
+{response_key}
+{response}
+
+{end_key}""".format(
+    intro=INTRO_BLURB,
+    instruction_key=INSTRUCTION_KEY,
+    instruction="{instruction}",
+    input_key=INPUT_KEY,
+    input="{input}",
+    response_key=RESPONSE_KEY,
+    response="{response}",
+    end_key=END_KEY,
+)
+
+# This is the prompt that is used for generating responses using an already trained model. It ends with the response
+# key, where the job of the model is to provide the completion that follows it (i.e. the response itself).
+PROMPT_FOR_GENERATION_FORMAT = """{intro}
+
+{instruction_key}
+{instruction}
+
+{response_key}
+""".format(
+    intro=INTRO_BLURB,
+    instruction_key=INSTRUCTION_KEY,
+    instruction="{instruction}",
+    response_key=RESPONSE_KEY,
+)
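These templates are presumably what `trainer.py` uses to build the `text` column described in the commit message (constructed from instruction, context, and response). A usage illustration with a made-up record; note that `{instruction}`, `{input}`, and `{response}` survive the first `.format` call as literal placeholders, so they can be filled per record:

```python
from training.consts import PROMPT_NO_INPUT_FORMAT, PROMPT_WITH_INPUT_FORMAT

# Record without context: instruction and response only.
print(PROMPT_NO_INPUT_FORMAT.format(
    instruction="Name the first president of the United States.",
    response="George Washington.",
))

# Record with context: the "input" slot carries the supporting passage.
print(PROMPT_WITH_INPUT_FORMAT.format(
    instruction="When did George Washington serve as president of the Constitutional Convention?",
    input="Washington served as president of the Constitutional Convention of 1787.",
    response="1787.",
))
```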
