
Commit 30ec177

Merge pull request #137 from georgian-io/generate-config

[CLI] Generate Example `config.yml` anywhere via `llmtune generate config`

2 parents e52c773 + e1fa137

File tree

6 files changed: +123 −50 lines changed

README.md

Lines changed: 14 additions & 10 deletions
@@ -13,18 +13,21 @@ LLM Finetuning toolkit is a config-based CLI tool for launching a series of LLM
 </p>
 
 ## Installation
+
 ### pipx (recommended)
+
 pipx installs the package and dependencies in a separate virtual environment
+
 ```shell
 pipx install llm-toolkit
 ```
 
 ### pip
+
 ```shell
 pip install llm-toolkit
 ```
 
-
 ## Quick Start
 
 This guide contains 3 stages that will enable you to get the most out of this toolkit!
@@ -35,11 +38,14 @@ This guide contains 3 stages that will enable you to get the most out of this to
 
 ### Basic
 
-```python
-llmtune --config-path ./config.yml
+```shell
+llmtune generate config
+llmtune run --config-path ./config.yml
 ```
 
-This command initiates the fine-tuning process using the settings specified in the default YAML configuration file `config.yaml`.
+The first command generates a helpful starter `config.yml` file and saves it in the current working directory. This is provided so users can get started quickly, and as a base for further modification.
+
+Then the second command initiates the fine-tuning process using the settings specified in the default YAML configuration file `config.yml`.
 
 ### Intermediate
 
@@ -247,6 +253,7 @@ NOTE: Be sure to merge the latest from "upstream" before making a pull request!
 # GPU
 docker run -it --gpus all llm-toolkit
 ```
+
 </details>
 
 <details>
@@ -257,6 +264,7 @@ See poetry documentation page for poetry [installation instructions](https://pyt
 ```shell
 poetry install
 ```
+
 </details>
 <details>
 <summary>pip</summary>
@@ -265,27 +273,23 @@ We recommend using a virtual environment like `venv` or `conda` for installation
 ```shell
 pip install -e .
 ```
+
 </details>
 </details>
 
-
-
 ### Checklist Before Pull Request (Optional)
 
 1. Use `ruff check --fix` to check and fix lint errors
 2. Use `ruff format` to apply formatting
 
 NOTE: Ruff linting and formatting checks are done when a PR is raised via GitHub Actions. Before raising a PR, it is a good practice to check and fix lint errors, as well as apply formatting.
 
-
 ### Releasing
 
-
-To manually release a PyPI package, please run:
+To manually release a PyPI package, please run:
 
 ```shell
 make build-release
 ```
 
 Note: Make sure you have a PyPI token for this [PyPI repo](https://pypi.org/project/llm-toolkit/).
-
config.yml

Lines changed: 2 additions & 2 deletions
@@ -61,7 +61,7 @@ training:
   warmup_ratio: 0.03
   lr_scheduler_type: "constant"
   sft_args:
-    max_seq_length: 5000
+    max_seq_length: 1024
     # neftune_noise_alpha: None
 
 inference:
@@ -80,4 +80,4 @@ qa:
   - verb_percent
   - adjective_percent
   - noun_percent
-  - summary_length
\ No newline at end of file
+  - summary_length

llmtune/cli/toolkit.py

Lines changed: 35 additions & 12 deletions
@@ -1,13 +1,16 @@
 import logging
-from os import listdir
-from os.path import exists, join
+import shutil
+from pathlib import Path
 
 import torch
 import typer
 import yaml
 from pydantic import ValidationError
 from transformers import utils as hf_utils
+from typing_extensions import Annotated
 
+import llmtune
+from llmtune.constants.files import EXAMPLE_CONFIG_FNAME
 from llmtune.data.dataset_generator import DatasetGenerator
 from llmtune.finetune.lora import LoRAFinetune
 from llmtune.inference.lora import LoRAInference
@@ -22,9 +25,16 @@
 
 
 app = typer.Typer()
+generate_app = typer.Typer()
 
+app.add_typer(
+    generate_app,
+    name="generate",
+    help="Generate various artefacts, such as config files",
+)
 
-def run_one_experiment(config: Config, config_path: str) -> None:
+
+def run_one_experiment(config: Config, config_path: Path) -> None:
     dir_helper = DirectoryHelper(config_path, config)
 
     # Loading Data -------------------------------
@@ -37,7 +47,7 @@ def run_one_experiment(config: Config, config_path: str) -> None:
     test_column = dataset_generator.test_column
 
     dataset_path = dir_helper.save_paths.dataset
-    if not exists(dataset_path):
+    if not dataset_path.exists():
         train, test = dataset_generator.get_dataset()
         dataset_generator.save_dataset(dataset_path)
     else:
@@ -53,7 +63,7 @@ def run_one_experiment(config: Config, config_path: str) -> None:
     weights_path = dir_helper.save_paths.weights
 
     # model_loader = ModelLoader(config, console, dir_helper)
-    if not exists(weights_path) or not listdir(weights_path):
+    if not weights_path.exists() or not any(weights_path.iterdir()):
         finetuner = LoRAFinetune(config, dir_helper)
         with RichUI.during_finetune():
             finetuner.finetune(train)
@@ -65,13 +75,13 @@ def run_one_experiment(config: Config, config_path: str) -> None:
     # Inference -------------------------------
     RichUI.before_inference()
     results_path = dir_helper.save_paths.results
-    results_file_path = join(dir_helper.save_paths.results, "results.csv")
-    if not exists(results_path) or exists(results_file_path):
+    results_file_path = dir_helper.save_paths.results_file
+    if not results_file_path.exists():
         inference_runner = LoRAInference(test, test_column, config, dir_helper)
         inference_runner.infer_all()
         RichUI.after_inference(results_path)
     else:
-        RichUI.inference_found(results_path)
+        RichUI.results_found(results_path)
 
     # QA -------------------------------
     # RichUI.before_qa()
@@ -84,10 +94,11 @@ def run_one_experiment(config: Config, config_path: str) -> None:
     # pass
 
 
-@app.command()
-def run(config_path: str = "./config.yml") -> None:
+@app.command("run")
+def run(config_path: Annotated[str, typer.Argument(help="Path of the config yaml file")] = "./config.yml") -> None:
+    """Run the entire experiment pipeline"""
     # Load YAML config
-    with open(config_path, "r") as file:
+    with Path(config_path).open("r") as file:
         config = yaml.safe_load(file)
     configs = (
         generate_permutations(config, Config) if config.get("ablation", {}).get("use_ablate", False) else [config]
@@ -102,12 +113,24 @@ def run(config_path: str = "./config.yml") -> None:
         dir_helper = DirectoryHelper(config_path, config)
 
         # Reload config from saved config
-        with open(join(dir_helper.save_paths.config, "config.yml"), "r") as file:
+        with dir_helper.save_paths.config_file.open("r") as file:
             config = yaml.safe_load(file)
         config = Config(**config)
 
         run_one_experiment(config, config_path)
 
 
+@generate_app.command("config")
+def generate_config():
+    """
+    Generate an example `config.yml` file in current directory
+    """
+    module_path = Path(llmtune.__file__).parent
+    example_config_path = module_path.parent / EXAMPLE_CONFIG_FNAME
+    destination = Path.cwd()
+    shutil.copy(example_config_path, destination)
+    RichUI.generate_config(EXAMPLE_CONFIG_FNAME)
+
+
 def cli():
     app()
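
For readers skimming the diff, here is a minimal, self-contained sketch (not llmtune's actual module) of the Typer sub-app pattern introduced above: a second `typer.Typer()` instance is mounted under the name `generate`, so its commands are invoked as `llmtune generate config`. The file path and message below are illustrative placeholders.

```python
# sketch_cli.py -- standalone illustration of the app.add_typer pattern above
import shutil
from pathlib import Path

import typer

app = typer.Typer()
generate_app = typer.Typer()

# Mount the sub-app: its commands run as `<prog> generate <command>`.
app.add_typer(generate_app, name="generate", help="Generate various artefacts")


@generate_app.command("config")
def generate_config() -> None:
    """Copy an example config into the current working directory."""
    # Placeholder source path; llmtune instead resolves the bundled file
    # relative to the installed package via Path(llmtune.__file__).parent.parent.
    example_config = Path(__file__).parent / "config.yml"
    shutil.copy(example_config, Path.cwd())
    typer.echo("Generated config at ./config.yml")


if __name__ == "__main__":
    app()  # e.g. `python sketch_cli.py generate config`
```

Note that `shutil.copy` accepts a directory as its destination and keeps the source file name, which is why the command can copy straight into `Path.cwd()`.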

llmtune/constants/files.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+# Example config file
+EXAMPLE_CONFIG_FNAME = "config.yml"
+
+# DIRECTORY HELPER - HASH SETTING
+NUM_MD5_DIGITS_FOR_SQIDS = 2
+
+# DIRECTORY HELPER - DIRECTORY & FILE NAMES
+CONFIG_DIR_NAME = "config"
+CONFIG_FILE_NAME = "config.yml"
+
+DATASET_DIR_NAME = "dataset"
+
+WEIGHTS_DIR_NAME = "weights"
+
+RESULTS_DIR_NAME = "results"
+RESULTS_FILE_NAME = "results.csv"
+
+QA_DIR_NAME = "qa"
+QA_FILE_NAME = "qa_test_results.csv"

llmtune/ui/rich_ui.py

Lines changed: 8 additions & 0 deletions
@@ -203,3 +203,11 @@ def qa_display_table(self, result_dictionary, mean_values, median_values, stdev_
 
         # Print the table
         console.print(table)
+
+    """
+    GENERATE
+    """
+
+    @staticmethod
+    def generate_config(file_name: str):
+        console.print(f"Generated config at [bold green]./{file_name}[/]")

llmtune/utils/save_utils.py

Lines changed: 45 additions & 26 deletions
@@ -5,60 +5,79 @@
 """
 
 import hashlib
-import os
 import re
 from dataclasses import dataclass
 from functools import cached_property
-from os.path import exists
+from pathlib import Path
 
 import yaml
 from sqids import Sqids
 
+from llmtune.constants.files import (
+    CONFIG_DIR_NAME,
+    CONFIG_FILE_NAME,
+    DATASET_DIR_NAME,
+    NUM_MD5_DIGITS_FOR_SQIDS,
+    QA_DIR_NAME,
+    QA_FILE_NAME,
+    RESULTS_DIR_NAME,
+    RESULTS_FILE_NAME,
+    WEIGHTS_DIR_NAME,
+)
 from llmtune.pydantic_models.config_model import Config
 
 
-NUM_MD5_DIGITS_FOR_SQIDS = 5  # TODO: maybe move consts to a dedicated folder
-
-
 @dataclass
 class DirectoryList:
-    save_dir: str
+    save_dir: Path
     config_hash: str
 
     @property
-    def experiment(self) -> str:
-        return os.path.join(self.save_dir, self.config_hash)
+    def experiment(self) -> Path:
+        return self.save_dir / self.config_hash
+
+    @property
+    def config(self) -> Path:
+        return self.experiment / CONFIG_DIR_NAME
+
+    @property
+    def config_file(self) -> Path:
+        return self.config / CONFIG_FILE_NAME
+
+    @property
+    def dataset(self) -> Path:
+        return self.experiment / DATASET_DIR_NAME
 
     @property
-    def config(self) -> str:
-        return os.path.join(self.experiment, "config")
+    def weights(self) -> Path:
+        return self.experiment / WEIGHTS_DIR_NAME
 
     @property
-    def dataset(self) -> str:
-        return os.path.join(self.experiment, "dataset")
+    def results(self) -> Path:
+        return self.experiment / RESULTS_DIR_NAME
 
     @property
-    def weights(self) -> str:
-        return os.path.join(self.experiment, "weights")
+    def results_file(self) -> Path:
+        return self.results / RESULTS_FILE_NAME
 
     @property
-    def results(self) -> str:
-        return os.path.join(self.experiment, "results")
+    def qa(self) -> Path:
+        return self.experiment / QA_DIR_NAME
 
     @property
-    def qa(self) -> str:
-        return os.path.join(self.experiment, "qa")
+    def qa_file(self) -> Path:
+        return self.qa / QA_FILE_NAME
 
 
 class DirectoryHelper:
-    def __init__(self, config_path: str, config: Config):
-        self.config_path: str = config_path
+    def __init__(self, config_path: Path, config: Config):
+        self.config_path: Path = config_path
         self.config: Config = config
         self.sqids: Sqids = Sqids()
         self.save_paths: DirectoryList = self._get_directory_state()
 
-        os.makedirs(self.save_paths.experiment, exist_ok=True)
-        if not exists(self.save_paths.config):
+        self.save_paths.experiment.mkdir(parents=True, exist_ok=True)
+        if not self.save_paths.config.exists():
             self.save_config()
 
     @cached_property
@@ -70,15 +89,15 @@ def config_hash(self) -> str:
 
     def _get_directory_state(self) -> DirectoryList:
         save_dir = (
-            self.config.save_dir
+            Path(self.config.save_dir)
             if not self.config.ablation.use_ablate
-            else os.path.join(self.config.save_dir, self.config.ablation.study_name)
+            else Path(self.config.save_dir) / self.config.ablation.study_name
         )
         return DirectoryList(save_dir, self.config_hash)
 
     def save_config(self) -> None:
-        os.makedirs(self.save_paths.config, exist_ok=True)
+        self.save_paths.config.mkdir(parents=True, exist_ok=True)
         model_dict = self.config.model_dump()
 
-        with open(os.path.join(self.save_paths.config, "config.yml"), "w") as file:
+        with (self.save_paths.config / "config.yml").open("w") as file:
            yaml.dump(model_dict, file)
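
The heart of this refactor is that every save location becomes a `Path` property composed with the `/` operator from the constants in `llmtune/constants/files.py`, so callers probe and create directories with `Path` methods instead of `os.path` helpers. A minimal sketch of the same pattern, with simplified names rather than the full `DirectoryList`:

```python
# Minimal illustration of the pathlib property-composition pattern above.
from dataclasses import dataclass
from pathlib import Path

RESULTS_DIR_NAME = "results"       # mirrors llmtune.constants.files
RESULTS_FILE_NAME = "results.csv"


@dataclass
class SavePaths:
    save_dir: Path
    config_hash: str

    @property
    def experiment(self) -> Path:
        return self.save_dir / self.config_hash

    @property
    def results(self) -> Path:
        return self.experiment / RESULTS_DIR_NAME

    @property
    def results_file(self) -> Path:
        return self.results / RESULTS_FILE_NAME


paths = SavePaths(Path("./experiments"), "ab12")
paths.results.mkdir(parents=True, exist_ok=True)   # replaces os.makedirs(...)
print(paths.results_file.exists())                 # replaces exists(join(...))
```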
