Implement vLLM FSDP LoRA hot-swapping integration #10

```diff
@@ -9,4 +9,5 @@ data/
 **/*.pyc
 /.cache
 /.vscode
 /data
+/env
```

@@ -0,0 +1,68 @@

```yaml
model: google/gemma-2b
enable_wandb_logging: False

wandb_config:
  project: vector-lm-verify
  name: benchmark-lora
  # tags: ["20240418-1a-preemption"]

train_parameters:
  output_dir: weights
  max_seq_len: 128
  epochs: 10
  seed: 11

  # Sharding strategy
  sharding_strategy: FULL_SHARD

  # Memory
  use_mp: True
  use_activation_checkpointing: True
  # use_flash_attention is automatically enabled
  # for CUDA capability > 8.0
  use_flash_attention: False
  low_cpu_mem_usage: True

  lora_peft_config:
    task_type: CAUSAL_LM
    inference_mode: False
    r: 8
    lora_alpha: 32
    lora_dropout: 0.1

  # Gradient norm clipping
  max_grad_norm: 1
  gradient_accumulation_steps: 4

  # Optimizer
  optimizer:
    lr: 1.0e-4
    weight_decay: 0.1
    betas: [0.9, 0.95]
    eps: 1.0e-5

  # Scheduler
  lr_scheduler_type: cosine
  warmup_ratio: 0.05

  # Checkpointing
  checkpointing_enabled: False
  logging_steps: 10
  save_frequency: 0.10

  # Sampling during training
  sampler:
    sample_frequency: 8
    output_jsonl_path: data/output-5e-5-2b.jsonl
    vllm_dtype: half
    prompts:
      - "Vector Institute of the"
      - "Vector Institute is located in the city of"
      - "The answer to life the universe and everything is"

dataset:
  ignore_index: -100
  eval_bs: 8
  train_bs: 8
  train_ds: data/processed/vector-west/train
  eval_ds: data/processed/vector-west/test
```
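
The keys under `lora_peft_config` line up with the parameters of `peft.LoraConfig`. As a minimal sketch, assuming the block is forwarded to PEFT as keyword arguments (that wiring is not shown in this diff):

```python
# Sketch only: mirrors the lora_peft_config block above, assuming it is
# passed through to peft.LoraConfig as keyword arguments.
from peft import LoraConfig, TaskType, get_peft_model

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
# model = get_peft_model(base_model, lora_config)  # base_model: the loaded google/gemma-2b
```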

```diff
@@ -51,6 +51,22 @@ Similar to the wandb config above, these keyword parameters are fed directly int
 * `logging_steps`: How often evaluation is run using the evaluation dataset.
 * `save_frequency`: The frequency at which checkpointing occurs. This must be between 0 and 1.
 
+### Sampling during Training
+
+To disable sampling during training, delete the entire "sampler" section.
+
+* `sample_frequency`: Number of training steps between two consecutive sampling steps.
+* `output_jsonl_path`: Optional; write sampled output to the specified JSONL file.
+* `prompts`: YAML list of prompt strings.
+
+Each line of the output JSONL file is a dictionary with the keys:
+
+* `tr_step`: integer, the trainer step at which this line was generated.
+* `prompt`: string.
+* `options`: list of strings, one for each option the sampler produced.
+* `time_taken`: float, number of seconds taken to generate **all** prompts at this step.
+
 ## Dataset
 
 * `ignore_index`: The integer index used to ignore a given token in the loss calculation. Cross-entropy loss by default uses `-100`.
```
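
For illustration, one line of the sampler output JSONL described above could look like the following (all values are made up):

```json
{"tr_step": 80, "prompt": "Vector Institute is located in the city of", "options": ["Toronto, Ontario."], "time_taken": 1.27}
```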

**Collaborator:** Great writeup!

@@ -0,0 +1,26 @@

# Efficient Sampling during training

Some training objectives, notably PPO, require "sampling" from the language model many times during training. The most straightforward approach is to invoke `model.generate` from within the training loop. However, a number of alternative inference solutions, vLLM among them, promise over 10x the sampling throughput (in tokens generated per second) when using a large sampling batch size. If `model.generate` takes up too much of the training time, it may be worthwhile to look into these third-party solutions to speed up sampling.

One main challenge of running these third-party solutions, however, is that most of them assume the weights of the language model are fixed, so there is no straightforward way to update those weights. Usually, updating the weights requires restarting the sampling engine, which can take minutes. At the same time, the performance of PPO and similar techniques relies heavily on the ability to replace the weights efficiently; otherwise, training is no longer on-policy and convergence takes substantially more training steps. To resolve this issue, we implemented techniques to "hot-swap" the model parameters used in the sampling process.
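
As a minimal sketch of the idea (not the exact mechanism used in this repository; `sampling_model` is a hypothetical stand-in for the module the sampling engine serves), hot-swapping amounts to copying the current training weights into the already-running sampling model rather than rebuilding the engine:

```python
# Sketch, assuming an FSDP-wrapped training model and direct access to the
# nn.Module used by the sampling engine ("sampling_model", hypothetical).
import torch
from torch.distributed.fsdp import (
    FullStateDictConfig,
    FullyShardedDataParallel as FSDP,
    StateDictType,
)


def hot_swap_weights(fsdp_model: FSDP, sampling_model: torch.nn.Module) -> None:
    """Copy the latest training weights into the live sampling model."""
    cfg = FullStateDictConfig(offload_to_cpu=True, rank0_only=False)
    with FSDP.state_dict_type(fsdp_model, StateDictType.FULL_STATE_DICT, cfg):
        full_state = fsdp_model.state_dict()  # gather the unsharded weights
    # Parameters are replaced in place; no sampling-engine restart required.
    sampling_model.load_state_dict(full_state, strict=False)
```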

Additionally, it is not straightforward to maintain consistently high GPU utilization when combining sampling with training. This repository enables you to make the most of all your GPUs by fitting vLLM and your training loop onto the same set of devices. This way, no GPU sits idle: if a GPU is not running training, it is busy sampling with vLLM. These slides ([link](https://docs.google.com/presentation/d/1FCa5O8RYYkRRCAAcXhqCvomePo5fEfhjQciSteTEJ30/edit?usp=sharing)) provide an overview of the architecture behind this approach.

## Example: Supervised fine-tuning

We provide a basic example that samples from the language model while fine-tuning it with a standard causal language modelling objective. To run the example, uncomment the "sampler" section in your configuration YAML, choose a port for `nccl` coordination, and run the following command (not via torchrun):

```bash
export MASTER_ADDR=127.0.0.1
export MASTER_PORT=19132
python3 examples/llama_example_mp.py \
    --yaml_path configs/config.yaml \
    --world_size 2
```

## Bring your own training loop

While the reference implementation covers only supervised fine-tuning, we provide abstractions that make it easier for you to implement your own training loop, be it PPO RLHF, TWIST, or something else. The goal is to abstract away all of the synchronization logic, so that a training loop built for a single GPU can scale to multiple GPUs on the same server with minimal modification.

To get started, refer to `examples/llama_example.py` and `vectorlm/trainer.py`. Usually, the vLLM engine is accessible only from rank 0, which makes synchronization challenging. When invoked through `llama_example_mp`, the `SamplingEngine` interface in VectorLM enables your training loop to access `vLLM.LLM.generate` from all ranks, returning the same result on every rank. Because the synchronization barriers require all ranks to reach the synchronization point, you must invoke `generate` from all ranks.
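
For instance, a custom training loop would call `generate` collectively at a fixed interval, roughly as in the sketch below (the `trainer` and `sampling_engine` objects are illustrative stand-ins, not the repository's exact API):

```python
# Illustrative sketch of a custom loop that samples every `sample_frequency`
# steps; the trainer and sampling_engine interfaces are hypothetical.
from typing import Iterable, List


def training_loop(
    trainer,            # assumed to expose train_step(batch) -> float
    sampling_engine,    # assumed to expose generate(prompts) -> List[str], collective across ranks
    dataloader: Iterable,
    prompts: List[str],
    sample_frequency: int,
    rank: int,
) -> None:
    for step, batch in enumerate(dataloader):
        loss = trainer.train_step(batch)

        # Every rank must reach this point and call generate(); the call is
        # synchronized and returns identical results on all ranks.
        if step % sample_frequency == 0:
            outputs = sampling_engine.generate(prompts)
            if rank == 0:
                print(f"step {step}: loss={loss:.4f}, sampled {len(outputs)} continuations")
```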

@@ -0,0 +1,26 @@

```bash
#!/bin/bash
#SBATCH --job-name=llama7b-2
#SBATCH --nodes=1
#SBATCH --mem=0
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-gpu=6
#SBATCH --gres=gpu:4
#SBATCH --output=llama-2-7b.%j.out
#SBATCH --error=llama-2-7b.%j.err
#SBATCH --partition=a100
#SBATCH --qos=your_assigned_qos # CHANGE
#SBATCH --open-mode=append
#SBATCH --wait-all-nodes=1
#SBATCH --time=3-00

export NCCL_IB_DISABLE=1  # Our cluster does not have InfiniBand; disable it with this flag.
export NCCL_DEBUG=WARN
export NCCL_DEBUG_SUBSYS=WARN

# export TORCH_DISTRIBUTED_DEBUG=DETAIL  # Uncomment these flags to debug communication.
# export TORCH_CPP_LOG_LEVEL=INFO
export LOGLEVEL=INFO
export PYTHONFAULTHANDLER=1
# export CUDA_LAUNCH_BLOCKING=0

torchrun --nnodes=1 --nproc-per-node=${SLURM_GPUS_ON_NODE} example_lora.py --yaml_path configs/config-lora.yaml
```

@@ -0,0 +1,26 @@

```bash
#!/bin/bash
#SBATCH --job-name=llama7b-2-lora
#SBATCH --nodes=1
#SBATCH --mem=32GB
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-gpu=6
#SBATCH --gres=gpu:1
#SBATCH --output=llama-2-7b-lora.%j.out
#SBATCH --error=llama-2-7b-lora.%j.err
#SBATCH --partition=a100
#SBATCH --qos=your_assigned_qos # CHANGE
#SBATCH --open-mode=append
#SBATCH --wait-all-nodes=1
#SBATCH --time=3-00

export NCCL_IB_DISABLE=1  # Our cluster does not have InfiniBand; disable it with this flag.
export NCCL_DEBUG=WARN
export NCCL_DEBUG_SUBSYS=WARN

# export TORCH_DISTRIBUTED_DEBUG=DETAIL  # Uncomment these flags to debug communication.
# export TORCH_CPP_LOG_LEVEL=INFO
export LOGLEVEL=INFO
export PYTHONFAULTHANDLER=1
# export CUDA_LAUNCH_BLOCKING=0

torchrun --nnodes=1 --nproc-per-node=1 example_lora.py --yaml_path configs/config-lora.yaml
```

**Reviewer:** Is this file required to be a part of the main codebase?

**Reply:** That config file has been included by mistake. I will delete that from version control.