
Commit 3aca84e

Merge branch 'main' into integrate-main-qa
2 parents 9315f1f + 999f7b2 commit 3aca84e

File tree

10 files changed: +258 additions, -117 deletions

README.md

Lines changed: 39 additions & 11 deletions
@@ -13,38 +13,68 @@ LLM Finetuning toolkit is a config-based CLI tool for launching a series of LLM
 </p>
 
 ## Installation
+
 ### pipx (recommended)
+
 pipx installs the package and dependencies in a separate virtual environment
+
 ```shell
 pipx install llm-toolkit
 ```
 
 ### pip
+
 ```shell
 pip install llm-toolkit
 ```
 
-
 ## Quick Start
 
 This guide contains 3 stages that will enable you to get the most out of this toolkit!
 
 - **Basic**: Run your first LLM fine-tuning experiment
-- **Intermediate**: Run a custom experiment by changing the componenets of the YAML configuration file
+- **Intermediate**: Run a custom experiment by changing the components of the YAML configuration file
 - **Advanced**: Launch a series of fine-tuning experiments across different prompt templates, LLMs, and optimization techniques -- all through **one** YAML configuration file
 
 ### Basic
 
-```python
-llmtune --config-path ./config.yml
+```shell
+llmtune generate config
+llmtune run ./config.yml
 ```
 
-This command initiates the fine-tuning process using the settings specified in the default YAML configuration file `config.yaml`.
+The first command generates a helpful starter `config.yml` file and saves it in the current working directory, giving you a quick start and a base for further modification.
+
+The second command then initiates the fine-tuning process using the settings specified in that `config.yml` file.
 
 ### Intermediate
 
 The configuration file is the central piece that defines the behavior of the toolkit. It is written in YAML format and consists of several sections that control different aspects of the process, such as data ingestion, model definition, training, inference, and quality assurance. We highlight some of the critical sections.
 
+#### Flash Attention 2
+
+To enable Flash Attention 2 for [supported models](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2), first install `flash-attn`:
+
+**pipx**
+
+```shell
+pipx inject llm-toolkit flash-attn --pip-args=--no-build-isolation
+```
+
+**pip**
+
+```shell
+pip install flash-attn --no-build-isolation
+```
+
+Then add the following to the config file:
+
+```yaml
+model:
+  torch_dtype: "bfloat16" # or "float16" if using an older GPU
+  attn_implementation: "flash_attention_2"
+```
+
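For reference, these two keys mirror same-named arguments of the Hugging Face `transformers` loader. A minimal sketch of the equivalent direct call -- an illustration, not part of this commit, assuming `transformers` >= 4.36, a CUDA GPU, and `flash-attn` installed as above:

```python
# Minimal sketch (assumption): the `torch_dtype` and `attn_implementation`
# config keys above map onto the same-named `from_pretrained` kwargs.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-Instruct-v0.2",     # checkpoint used in llmtune/config.yml below
    torch_dtype=torch.bfloat16,               # or torch.float16 on older GPUs
    attn_implementation="flash_attention_2",  # requires the flash-attn package
)
```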
 #### Data Ingestion
 
 An example of what the data ingestion may look like:
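The example itself falls outside this hunk. As a rough, hypothetical sketch of how such data settings feed the ingestion layer -- the key names `file_type` and `path` are placeholders, while `get_ingestor` and the ingestor classes are shown in `llmtune/data/ingestor.py` further down this page:

```python
# Hypothetical sketch only: route a data-ingestion setting to an ingestor.
# The dict keys below are illustrative placeholders, not confirmed config names.
from llmtune.data.ingestor import get_ingestor

data_cfg = {"file_type": "jsonl", "path": "train.jsonl"}  # placeholder keys/values
ingestor_cls = get_ingestor(data_cfg["file_type"])        # JsonlIngestor for "jsonl"
dataset = ingestor_cls(data_cfg["path"]).to_dataset()     # returns a datasets.Dataset
```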
@@ -247,6 +277,7 @@ NOTE: Be sure to merge the latest from "upstream" before making a pull request!
 # GPU
 docker run -it --gpus all llm-toolkit
 ```
+
 </details>
 
 <details>
@@ -257,6 +288,7 @@ See poetry documentation page for poetry [installation instructions](https://pyt
 ```shell
 poetry install
 ```
+
 </details>
 <details>
 <summary>pip</summary>
@@ -265,27 +297,23 @@ We recommend using a virtual environment like `venv` or `conda` for installation
 ```shell
 pip install -e .
 ```
+
 </details>
 </details>
 
-
-
 ### Checklist Before Pull Request (Optional)
 
 1. Use `ruff check --fix` to check and fix lint errors
 2. Use `ruff format` to apply formatting
 
 NOTE: Ruff linting and formatting checks are run via GitHub Actions when a PR is raised. Before raising a PR, it is good practice to check and fix lint errors, as well as apply formatting.
 
-
 ### Releasing
 
-
-To manually release a PyPI package, please run:
+To manually release a PyPI package, please run:
 
 ```shell
 make build-release
 ```
 
 Note: Make sure you have a PyPI token for this [PyPI repo](https://pypi.org/project/llm-toolkit/).
-

llmtune/cli/toolkit.py

Lines changed: 39 additions & 12 deletions
@@ -2,13 +2,19 @@
 import os
 from os import listdir
 from os.path import exists, join
+import shutil
+from pathlib import Path
+
 
 import torch
+import transformers
 import typer
 import yaml
 from pydantic import ValidationError
-from transformers import utils as hf_utils
+from typing_extensions import Annotated
 
+import llmtune
+from llmtune.constants.files import EXAMPLE_CONFIG_FNAME
 from llmtune.data.dataset_generator import DatasetGenerator
 from llmtune.finetune.lora import LoRAFinetune
 from llmtune.inference.lora import LoRAInference
@@ -19,14 +25,22 @@
 from llmtune.utils.save_utils import DirectoryHelper
 
 
-hf_utils.logging.set_verbosity_error()
+transformers.logging.set_verbosity(transformers.logging.CRITICAL)
 torch._logging.set_logs(all=logging.CRITICAL)
+logging.captureWarnings(True)
 
 
 app = typer.Typer()
+generate_app = typer.Typer()
+
+app.add_typer(
+    generate_app,
+    name="generate",
+    help="Generate various artefacts, such as config files",
+)
 
 
-def run_one_experiment(config: Config, config_path: str) -> None:
+def run_one_experiment(config: Config, config_path: Path) -> None:
     dir_helper = DirectoryHelper(config_path, config)
 
     # Loading Data -------------------------------
@@ -39,7 +53,7 @@ def run_one_experiment(config: Config, config_path: str) -> None:
     test_column = dataset_generator.test_column
 
     dataset_path = dir_helper.save_paths.dataset
-    if not exists(dataset_path):
+    if not dataset_path.exists():
         train, test = dataset_generator.get_dataset()
         dataset_generator.save_dataset(dataset_path)
     else:
@@ -55,7 +69,7 @@ def run_one_experiment(config: Config, config_path: str) -> None:
     weights_path = dir_helper.save_paths.weights
 
     # model_loader = ModelLoader(config, console, dir_helper)
-    if not exists(weights_path) or not listdir(weights_path):
+    if not weights_path.exists() or not any(weights_path.iterdir()):
         finetuner = LoRAFinetune(config, dir_helper)
         with RichUI.during_finetune():
             finetuner.finetune(train)
@@ -67,13 +81,13 @@ def run_one_experiment(config: Config, config_path: str) -> None:
     # Inference -------------------------------
     RichUI.before_inference()
     results_path = dir_helper.save_paths.results
-    results_file_path = join(dir_helper.save_paths.results, "results.csv")
-    if not exists(results_path) or exists(results_file_path):
+    results_file_path = dir_helper.save_paths.results_file
+    if not results_file_path.exists():
         inference_runner = LoRAInference(test, test_column, config, dir_helper)
         inference_runner.infer_all()
         RichUI.after_inference(results_path)
     else:
-        RichUI.inference_found(results_path)
+        RichUI.results_found(results_path)
 
     RichUI.before_qa()
     qa_path = dir_helper.save_paths.qa
@@ -85,10 +99,11 @@ def run_one_experiment(config: Config, config_path: str) -> None:
     test_suite.save_test_results(os.path.join(qa_path, "unit_test_results.csv"))
 
 
-@app.command()
-def run(config_path: str = "./config.yml") -> None:
+@app.command("run")
+def run(config_path: Annotated[str, typer.Argument(help="Path of the config yaml file")] = "./config.yml") -> None:
+    """Run the entire experiment pipeline"""
     # Load YAML config
-    with open(config_path, "r") as file:
+    with Path(config_path).open("r") as file:
         config = yaml.safe_load(file)
     configs = (
         generate_permutations(config, Config) if config.get("ablation", {}).get("use_ablate", False) else [config]
@@ -103,12 +118,24 @@ def run(config_path: str = "./config.yml") -> None:
         dir_helper = DirectoryHelper(config_path, config)
 
         # Reload config from saved config
-        with open(join(dir_helper.save_paths.config, "config.yml"), "r") as file:
+        with dir_helper.save_paths.config_file.open("r") as file:
            config = yaml.safe_load(file)
         config = Config(**config)
 
         run_one_experiment(config, config_path)
 
 
+@generate_app.command("config")
+def generate_config():
+    """
+    Generate an example `config.yml` file in the current directory
+    """
+    module_path = Path(llmtune.__file__)
+    example_config_path = module_path.parent / EXAMPLE_CONFIG_FNAME
+    destination = Path.cwd()
+    shutil.copy(example_config_path, destination)
+    RichUI.generate_config(EXAMPLE_CONFIG_FNAME)
+
+
 def cli():
     app()
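One way to exercise the new `generate` sub-app and the annotated `run` argument is Typer's test runner. This is an illustration rather than part of the commit, and it assumes the package is importable:

```python
# Illustrative check of the new Typer wiring (not part of this commit).
# Note: invoking "generate config" really does copy config.yml into the CWD.
from typer.testing import CliRunner

from llmtune.cli.toolkit import app

runner = CliRunner()

result = runner.invoke(app, ["generate", "config"])
print(result.exit_code)  # 0 on success

result = runner.invoke(app, ["run", "--help"])
print(result.output)     # shows the "Path of the config yaml file" argument help
```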

config.yml renamed to llmtune/config.yml

Lines changed: 11 additions & 8 deletions
@@ -17,13 +17,15 @@ data:
   prompt_stub:
     >- # Stub to add for training at the end of prompt, for test set or inference, this is omitted; make sure only one variable is present
     {output}
-  test_size: 0.1 # Proportion of test as % of total; if integer then # of samples
-  train_size: 0.9 # Proportion of train as % of total; if integer then # of samples
+  test_size: 25 # Proportion of test as % of total; if integer then # of samples
+  train_size: 500 # Proportion of train as % of total; if integer then # of samples
   train_test_split_seed: 42
 
 # Model Definition -------------------
 model:
-  hf_model_ckpt: "NousResearch/Llama-2-7b-hf"
+  hf_model_ckpt: "mistralai/Mistral-7B-Instruct-v0.2"
+  torch_dtype: "bfloat16"
+  #attn_implementation: "flash_attention_2"
   quantize: true
   bitsandbytes:
     load_in_4bit: true
@@ -34,6 +36,7 @@ model:
 lora:
   task_type: "CAUSAL_LM"
   r: 32
+  lora_alpha: 64
   lora_dropout: 0.1
   target_modules:
     - q_proj
@@ -47,12 +50,12 @@ lora:
 # Training -------------------
 training:
   training_args:
-    num_train_epochs: 5
+    num_train_epochs: 1
     per_device_train_batch_size: 4
     gradient_accumulation_steps: 4
     gradient_checkpointing: True
     optim: "paged_adamw_32bit"
-    logging_steps: 100
+    logging_steps: 1
     learning_rate: 2.0e-4
     bf16: true # Set to true for mixed precision training on Newer GPUs
     tf32: true
@@ -61,11 +64,11 @@ training:
     warmup_ratio: 0.03
     lr_scheduler_type: "constant"
   sft_args:
-    max_seq_length: 5000
+    max_seq_length: 1024
     # neftune_noise_alpha: None
 
 inference:
-  max_new_tokens: 1024
+  max_new_tokens: 256
   use_cache: True
   do_sample: True
   top_p: 0.9
@@ -80,4 +83,4 @@ qa:
     - verb_percent
     - adjective_percent
     - noun_percent
-    - summary_length
+    - summary_length
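The comments on `test_size` / `train_size` describe the usual fraction-vs-count convention. Assuming the toolkit delegates to the Hugging Face `datasets` split API (an assumption about internals, not shown in this diff), the two styles behave like this:

```python
# Illustration of fraction-vs-count splits (assumes Hugging Face `datasets`).
from datasets import Dataset

ds = Dataset.from_dict({"text": [f"example {i}" for i in range(1000)]})

by_fraction = ds.train_test_split(test_size=0.1, train_size=0.9, seed=42)
by_count = ds.train_test_split(test_size=25, train_size=500, seed=42)

print(len(by_fraction["test"]), len(by_fraction["train"]))  # 100 900
print(len(by_count["test"]), len(by_count["train"]))        # 25 500
```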

llmtune/constants/files.py

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
+# Example config file
+EXAMPLE_CONFIG_FNAME = "config.yml"
+
+# DIRECTORY HELPER - HASH SETTING
+NUM_MD5_DIGITS_FOR_SQIDS = 2
+
+# DIRECTORY HELPER - DIRECTORY & FILE NAMES
+CONFIG_DIR_NAME = "config"
+CONFIG_FILE_NAME = "config.yml"
+
+DATASET_DIR_NAME = "dataset"
+
+WEIGHTS_DIR_NAME = "weights"
+
+RESULTS_DIR_NAME = "results"
+RESULTS_FILE_NAME = "results.csv"
+
+QA_DIR_NAME = "qa"
+QA_FILE_NAME = "qa_test_results.csv"
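These names are consumed by `DirectoryHelper` (referenced as `dir_helper.save_paths.results_file` in the CLI diff above). A small sketch of how they can compose into concrete paths, using a made-up experiment directory:

```python
# Illustration only: composing save paths from the new constants.
# The experiment directory below is hypothetical; the real path logic lives in
# llmtune/utils/save_utils.py (DirectoryHelper), which this diff does not show.
from pathlib import Path

from llmtune.constants.files import (
    CONFIG_DIR_NAME,
    CONFIG_FILE_NAME,
    RESULTS_DIR_NAME,
    RESULTS_FILE_NAME,
)

experiment_dir = Path("./experiment/ab12")  # hypothetical experiment folder
config_file = experiment_dir / CONFIG_DIR_NAME / CONFIG_FILE_NAME
results_file = experiment_dir / RESULTS_DIR_NAME / RESULTS_FILE_NAME
print(config_file)   # experiment/ab12/config/config.yml
print(results_file)  # experiment/ab12/results/results.csv
```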

llmtune/data/ingestor.py

Lines changed: 16 additions & 1 deletion
@@ -8,12 +8,14 @@
 def get_ingestor(data_type: str):
     if data_type == "json":
         return JsonIngestor
+    elif data_type == "jsonl":
+        return JsonlIngestor
     elif data_type == "csv":
         return CsvIngestor
     elif data_type == "huggingface":
         return HuggingfaceIngestor
     else:
-        raise ValueError(f"'type' must be one of 'json', 'csv', or 'huggingface', you have {data_type}")
+        raise ValueError(f"'type' must be one of 'json', 'jsonl', 'csv', or 'huggingface', you have {data_type}")
 
 
 class Ingestor(ABC):
@@ -35,6 +37,19 @@ def to_dataset(self) -> Dataset:
         return Dataset.from_generator(self._json_generator)
 
 
+class JsonlIngestor(Ingestor):
+    def __init__(self, path: str):
+        self.path = path
+
+    def _jsonl_generator(self):
+        with open(self.path, "rb") as f:
+            for item in ijson.items(f, "", multiple_values=True):
+                yield item
+
+    def to_dataset(self) -> Dataset:
+        return Dataset.from_generator(self._jsonl_generator)
+
+
 class CsvIngestor(Ingestor):
     def __init__(self, path: str):
         self.path = path
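The `JsonlIngestor` added above relies on `ijson.items(f, "", multiple_values=True)` to stream one JSON document per line. A self-contained sketch of that parsing pattern, as an illustration only, using a throwaway file:

```python
# Illustration of the JSONL parsing pattern used by JsonlIngestor above.
import ijson

with open("sample.jsonl", "wb") as f:  # throwaway example file
    f.write(b'{"instruction": "Summarize the text", "output": "..."}\n')
    f.write(b'{"instruction": "Translate to French", "output": "..."}\n')

with open("sample.jsonl", "rb") as f:
    # prefix "" + multiple_values=True yields each top-level JSON value in turn
    for item in ijson.items(f, "", multiple_values=True):
        print(item["instruction"])
```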
