
Commit 30ec177

Merge pull request #137 from georgian-io/generate-config

[CLI] Generate Example `config.yml` anywhere via `llmtune generate config`

2 parents e52c773 + e1fa137

File tree

6 files changed: +123 −50 lines changed

README.md

Lines changed: 14 additions & 10 deletions
@@ -13,18 +13,21 @@ LLM Finetuning toolkit is a config-based CLI tool for launching a series of LLM
 </p>
 
 ## Installation
+
 ### pipx (recommended)
+
 pipx installs the package and dependencies in a separate virtual environment
+
 ```shell
 pipx install llm-toolkit
 ```
 
 ### pip
+
 ```shell
 pip install llm-toolkit
 ```
 
-
 ## Quick Start
 
 This guide contains 3 stages that will enable you to get the most out of this toolkit!
@@ -35,11 +38,14 @@ This guide contains 3 stages that will enable you to get the most out of this to
 
 ### Basic
 
-```python
-llmtune --config-path ./config.yml
+```shell
+llmtune generate config
+llmtune run --config-path ./config.yml
 ```
 
-This command initiates the fine-tuning process using the settings specified in the default YAML configuration file `config.yaml`.
+The first command generates a helpful starter `config.yml` file and saves it in the current working directory. This is provided so users can get started quickly, and as a base for further modification.
+
+Then the second command initiates the fine-tuning process using the settings specified in the default YAML configuration file `config.yml`.
 
 ### Intermediate
 
@@ -247,6 +253,7 @@ NOTE: Be sure to merge the latest from "upstream" before making a pull request!
 # GPU
 docker run -it --gpus all llm-toolkit
 ```
+
 </details>
 
 <details>
@@ -257,6 +264,7 @@ See poetry documentation page for poetry [installation instructions](https://pyt
 ```shell
 poetry install
 ```
+
 </details>
 <details>
 <summary>pip</summary>
@@ -265,27 +273,23 @@ We recommend using a virtual environment like `venv` or `conda` for installation
 ```shell
 pip install -e .
 ```
+
 </details>
 </details>
 
-
-
 ### Checklist Before Pull Request (Optional)
 
 1. Use `ruff check --fix` to check and fix lint errors
 2. Use `ruff format` to apply formatting
 
 NOTE: Ruff linting and formatting checks are done when a PR is raised via GitHub Actions. Before raising a PR, it is a good practice to check and fix lint errors, as well as apply formatting.
 
-
 ### Releasing
 
-
-To manually release a PyPI package, please run:
+To manually release a PyPI package, please run:
 
 ```shell
 make build-release
 ```
 
 Note: Make sure you have a PyPI token for this [PyPI repo](https://pypi.org/project/llm-toolkit/).
-
config.yml

Lines changed: 2 additions & 2 deletions
@@ -61,7 +61,7 @@ training:
   warmup_ratio: 0.03
   lr_scheduler_type: "constant"
   sft_args:
-    max_seq_length: 5000
+    max_seq_length: 1024
     # neftune_noise_alpha: None
 
 inference:
@@ -80,4 +80,4 @@ qa:
   - verb_percent
   - adjective_percent
   - noun_percent
-  - summary_length
\ No newline at end of file
+  - summary_length

llmtune/cli/toolkit.py

Lines changed: 35 additions & 12 deletions
@@ -1,13 +1,16 @@
 import logging
-from os import listdir
-from os.path import exists, join
+import shutil
+from pathlib import Path
 
 import torch
 import typer
 import yaml
 from pydantic import ValidationError
 from transformers import utils as hf_utils
+from typing_extensions import Annotated
 
+import llmtune
+from llmtune.constants.files import EXAMPLE_CONFIG_FNAME
 from llmtune.data.dataset_generator import DatasetGenerator
 from llmtune.finetune.lora import LoRAFinetune
 from llmtune.inference.lora import LoRAInference
@@ -22,9 +25,16 @@
 
 
 app = typer.Typer()
+generate_app = typer.Typer()
 
+app.add_typer(
+    generate_app,
+    name="generate",
+    help="Generate various artefacts, such as config files",
+)
 
-def run_one_experiment(config: Config, config_path: str) -> None:
+
+def run_one_experiment(config: Config, config_path: Path) -> None:
     dir_helper = DirectoryHelper(config_path, config)
 
     # Loading Data -------------------------------
@@ -37,7 +47,7 @@ def run_one_experiment(config: Config, config_path: str) -> None:
     test_column = dataset_generator.test_column
 
     dataset_path = dir_helper.save_paths.dataset
-    if not exists(dataset_path):
+    if not dataset_path.exists():
         train, test = dataset_generator.get_dataset()
         dataset_generator.save_dataset(dataset_path)
     else:
@@ -53,7 +63,7 @@ def run_one_experiment(config: Config, config_path: str) -> None:
     weights_path = dir_helper.save_paths.weights
 
     # model_loader = ModelLoader(config, console, dir_helper)
-    if not exists(weights_path) or not listdir(weights_path):
+    if not weights_path.exists() or not any(weights_path.iterdir()):
         finetuner = LoRAFinetune(config, dir_helper)
         with RichUI.during_finetune():
             finetuner.finetune(train)
@@ -65,13 +75,13 @@ def run_one_experiment(config: Config, config_path: str) -> None:
     # Inference -------------------------------
     RichUI.before_inference()
     results_path = dir_helper.save_paths.results
-    results_file_path = join(dir_helper.save_paths.results, "results.csv")
-    if not exists(results_path) or exists(results_file_path):
+    results_file_path = dir_helper.save_paths.results_file
+    if not results_file_path.exists():
         inference_runner = LoRAInference(test, test_column, config, dir_helper)
         inference_runner.infer_all()
         RichUI.after_inference(results_path)
     else:
-        RichUI.inference_found(results_path)
+        RichUI.results_found(results_path)
 
     # QA -------------------------------
     # RichUI.before_qa()
@@ -84,10 +94,11 @@ def run_one_experiment(config: Config, config_path: str) -> None:
     # pass
 
 
-@app.command()
-def run(config_path: str = "./config.yml") -> None:
+@app.command("run")
+def run(config_path: Annotated[str, typer.Argument(help="Path of the config yaml file")] = "./config.yml") -> None:
+    """Run the entire experiment pipeline"""
     # Load YAML config
-    with open(config_path, "r") as file:
+    with Path(config_path).open("r") as file:
         config = yaml.safe_load(file)
     configs = (
         generate_permutations(config, Config) if config.get("ablation", {}).get("use_ablate", False) else [config]
@@ -102,12 +113,24 @@ def run(config_path: str = "./config.yml") -> None:
         dir_helper = DirectoryHelper(config_path, config)
 
         # Reload config from saved config
-        with open(join(dir_helper.save_paths.config, "config.yml"), "r") as file:
+        with dir_helper.save_paths.config_file.open("r") as file:
             config = yaml.safe_load(file)
         config = Config(**config)
 
         run_one_experiment(config, config_path)
 
 
+@generate_app.command("config")
+def generate_config():
+    """
+    Generate an example `config.yml` file in current directory
+    """
+    module_path = Path(llmtune.__file__).parent
+    example_config_path = module_path.parent / EXAMPLE_CONFIG_FNAME
+    destination = Path.cwd()
+    shutil.copy(example_config_path, destination)
+    RichUI.generate_config(EXAMPLE_CONFIG_FNAME)
+
+
 def cli():
     app()
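
For readers skimming the diff, here is a minimal, self-contained sketch (not llmtune's actual module) of the Typer sub-app pattern introduced above: a second `typer.Typer()` instance is mounted under the name `generate`, so its commands are invoked as `llmtune generate config`. The file path and message below are illustrative placeholders.

```python
# sketch_cli.py -- standalone illustration of the app.add_typer pattern above
import shutil
from pathlib import Path

import typer

app = typer.Typer()
generate_app = typer.Typer()

# Mount the sub-app: its commands run as `<prog> generate <command>`.
app.add_typer(generate_app, name="generate", help="Generate various artefacts")


@generate_app.command("config")
def generate_config() -> None:
    """Copy an example config into the current working directory."""
    # Placeholder source path; llmtune instead resolves the bundled file
    # relative to the installed package via Path(llmtune.__file__).parent.parent.
    example_config = Path(__file__).parent / "config.yml"
    shutil.copy(example_config, Path.cwd())
    typer.echo("Generated config at ./config.yml")


if __name__ == "__main__":
    app()  # e.g. `python sketch_cli.py generate config`
```

Note that `shutil.copy` accepts a directory as its destination and keeps the source file name, which is why the command can copy straight into `Path.cwd()`.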

llmtune/constants/files.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+# Example config file
+EXAMPLE_CONFIG_FNAME = "config.yml"
+
+# DIRECTORY HELPER - HASH SETTING
+NUM_MD5_DIGITS_FOR_SQIDS = 2
+
+# DIRECTORY HELPER - DIRECTORY & FILE NAMES
+CONFIG_DIR_NAME = "config"
+CONFIG_FILE_NAME = "config.yml"
+
+DATASET_DIR_NAME = "dataset"
+
+WEIGHTS_DIR_NAME = "weights"
+
+RESULTS_DIR_NAME = "results"
+RESULTS_FILE_NAME = "results.csv"
+
+QA_DIR_NAME = "qa"
+QA_FILE_NAME = "qa_test_results.csv"

llmtune/ui/rich_ui.py

Lines changed: 8 additions & 0 deletions
@@ -203,3 +203,11 @@ def qa_display_table(self, result_dictionary, mean_values, median_values, stdev_
 
         # Print the table
         console.print(table)
+
+    """
+    GENERATE
+    """
+
+    @staticmethod
+    def generate_config(file_name: str):
+        console.print(f"Generated config at [bold green]./{file_name}[/]")

llmtune/utils/save_utils.py

Lines changed: 45 additions & 26 deletions
@@ -5,60 +5,79 @@
 """
 
 import hashlib
-import os
 import re
 from dataclasses import dataclass
 from functools import cached_property
-from os.path import exists
+from pathlib import Path
 
 import yaml
 from sqids import Sqids
 
+from llmtune.constants.files import (
+    CONFIG_DIR_NAME,
+    CONFIG_FILE_NAME,
+    DATASET_DIR_NAME,
+    NUM_MD5_DIGITS_FOR_SQIDS,
+    QA_DIR_NAME,
+    QA_FILE_NAME,
+    RESULTS_DIR_NAME,
+    RESULTS_FILE_NAME,
+    WEIGHTS_DIR_NAME,
+)
 from llmtune.pydantic_models.config_model import Config
 
 
-NUM_MD5_DIGITS_FOR_SQIDS = 5  # TODO: maybe move consts to a dedicated folder
-
-
 @dataclass
 class DirectoryList:
-    save_dir: str
+    save_dir: Path
     config_hash: str
 
     @property
-    def experiment(self) -> str:
-        return os.path.join(self.save_dir, self.config_hash)
+    def experiment(self) -> Path:
+        return self.save_dir / self.config_hash
+
+    @property
+    def config(self) -> Path:
+        return self.experiment / CONFIG_DIR_NAME
+
+    @property
+    def config_file(self) -> Path:
+        return self.config / CONFIG_FILE_NAME
+
+    @property
+    def dataset(self) -> Path:
+        return self.experiment / DATASET_DIR_NAME
 
     @property
-    def config(self) -> str:
-        return os.path.join(self.experiment, "config")
+    def weights(self) -> Path:
+        return self.experiment / WEIGHTS_DIR_NAME
 
     @property
-    def dataset(self) -> str:
-        return os.path.join(self.experiment, "dataset")
+    def results(self) -> Path:
+        return self.experiment / RESULTS_DIR_NAME
 
     @property
-    def weights(self) -> str:
-        return os.path.join(self.experiment, "weights")
+    def results_file(self) -> Path:
+        return self.results / RESULTS_FILE_NAME
 
     @property
-    def results(self) -> str:
-        return os.path.join(self.experiment, "results")
+    def qa(self) -> Path:
+        return self.experiment / QA_DIR_NAME
 
     @property
-    def qa(self) -> str:
-        return os.path.join(self.experiment, "qa")
+    def qa_file(self) -> Path:
+        return self.qa / QA_FILE_NAME
 
 
 class DirectoryHelper:
-    def __init__(self, config_path: str, config: Config):
-        self.config_path: str = config_path
+    def __init__(self, config_path: Path, config: Config):
+        self.config_path: Path = config_path
         self.config: Config = config
         self.sqids: Sqids = Sqids()
         self.save_paths: DirectoryList = self._get_directory_state()
 
-        os.makedirs(self.save_paths.experiment, exist_ok=True)
-        if not exists(self.save_paths.config):
+        self.save_paths.experiment.mkdir(parents=True, exist_ok=True)
+        if not self.save_paths.config.exists():
             self.save_config()
 
     @cached_property
@@ -70,15 +89,15 @@ def config_hash(self) -> str:
 
     def _get_directory_state(self) -> DirectoryList:
         save_dir = (
-            self.config.save_dir
+            Path(self.config.save_dir)
             if not self.config.ablation.use_ablate
-            else os.path.join(self.config.save_dir, self.config.ablation.study_name)
+            else Path(self.config.save_dir) / self.config.ablation.study_name
         )
         return DirectoryList(save_dir, self.config_hash)
 
     def save_config(self) -> None:
-        os.makedirs(self.save_paths.config, exist_ok=True)
+        self.save_paths.config.mkdir(parents=True, exist_ok=True)
         model_dict = self.config.model_dump()
 
-        with open(os.path.join(self.save_paths.config, "config.yml"), "w") as file:
+        with (self.save_paths.config / "config.yml").open("w") as file:
            yaml.dump(model_dict, file)
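
The heart of this refactor is that every save location becomes a `Path` property composed with the `/` operator from the constants in `llmtune/constants/files.py`, so callers probe and create directories with `Path` methods instead of `os.path` helpers. A minimal sketch of the same pattern, with simplified names rather than the full `DirectoryList`:

```python
# Minimal illustration of the pathlib property-composition pattern above.
from dataclasses import dataclass
from pathlib import Path

RESULTS_DIR_NAME = "results"       # mirrors llmtune.constants.files
RESULTS_FILE_NAME = "results.csv"


@dataclass
class SavePaths:
    save_dir: Path
    config_hash: str

    @property
    def experiment(self) -> Path:
        return self.save_dir / self.config_hash

    @property
    def results(self) -> Path:
        return self.experiment / RESULTS_DIR_NAME

    @property
    def results_file(self) -> Path:
        return self.results / RESULTS_FILE_NAME


paths = SavePaths(Path("./experiments"), "ab12")
paths.results.mkdir(parents=True, exist_ok=True)   # replaces os.makedirs(...)
print(paths.results_file.exists())                 # replaces exists(join(...))
```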
