Add LongLora for both full and lora fine-tuning #1350
base: main
Changes from all commits
2ac1038
95a6539
b38f0ce
7bfe9ef
9facaf3
2dfa7a5
f7c6971
b35b79c
909ce04
6f631b3
e87c9d7
4867b0c
073b027
New file (141 lines): LoRA + LongLoRA fine-tuning configuration for Llama 2 7B.

```yaml
# The path to the base model's checkpoint directory to load for finetuning. (type: <class 'Path'>, default: checkpoints/stabilityai/stablelm-base-alpha-3b)
checkpoint_dir: checkpoints/meta-llama/Llama-2-7b-hf

# Directory in which to save checkpoints and logs. (type: <class 'Path'>, default: out/lora)
out_dir: out/finetune/lora-llama2-7b

# The precision to use for finetuning. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
precision: bf16-true

# If set, quantize the model with this algorithm. See ``tutorials/quantize.md`` for more information. (type: Optional[Literal['nf4', 'nf4-dq', 'fp4', 'fp4-dq', 'int8-training']], default: null)
quantize:

# How many devices/GPUs to use. (type: Union[int, str], default: 1)
devices: 1

# The LoRA rank. (type: int, default: 8)
lora_r: 8

# The LoRA alpha. (type: int, default: 16)
lora_alpha: 16

# The LoRA dropout value. (type: float, default: 0.05)
lora_dropout: 0.0

# Whether to apply LoRA to the query weights in attention. (type: bool, default: True)
lora_query: true

# Whether to apply LoRA to the key weights in attention. (type: bool, default: False)
lora_key: true

# Whether to apply LoRA to the value weights in attention. (type: bool, default: True)
lora_value: true

# Whether to apply LoRA to the output projection in the attention block. (type: bool, default: False)
lora_projection: true

# Whether to apply LoRA to the weights of the MLP in the attention block. (type: bool, default: False)
lora_mlp: false

# Whether to apply LoRA to output head in GPT. (type: bool, default: False)
lora_head: false

# Data-related arguments. If not provided, the default is ``litgpt.data.Alpaca``.
data:
  class_path: litgpt.data.Alpaca2k
  init_args:
    mask_prompt: false
    prompt_style: alpaca
    ignore_index: -100
    seed: 42
    num_workers: 4

# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
train:
  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
  save_interval: 200

  # Number of iterations between logging calls (type: int, default: 1)
  log_interval: 1

  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 128)
  global_batch_size: 8

  # Number of samples per data-parallel rank (type: int, default: 4)
  micro_batch_size: 2

  # Number of iterations with learning rate warmup active (type: int, default: 100)
  lr_warmup_steps: 10

  # Number of epochs to train on (type: Optional[int], default: 5)
  epochs: 4

  # Total number of tokens to train on (type: Optional[int], default: null)
  max_tokens:

  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
  max_steps:

  # Limits the length of samples. Off by default (type: Optional[int], default: null)
  max_seq_length: 512

  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: null)
  tie_embeddings:

  # (type: Optional[float], default: null)
  max_norm:

  # (type: float, default: 6e-05)
  min_lr: 6.0e-05

# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
eval:
  # Number of optimizer steps between evaluation calls (type: int, default: 100)
  interval: 100

  # Number of tokens to generate (type: Optional[int], default: 100)
  max_new_tokens: 100

  # Number of iterations (type: int, default: 100)
  max_iters: 100

# LongLoRA-related arguments. See ``litgpt.args.LongLoRAArgs`` for details
longlora:
  # Whether to use LongLoRA. (type: bool, default: false)
  use_longlora: true

  # The enlarged context length for LongLoRA. (type: int, default: 8192)
  context_length: 8192

  # The number of groups to split the sequence into. (type: int, default: 4)
  n_groups: 4

  # The additional trainable parameters for LongLoRA. (type: str, default: "wte,norm,ln")
  trainable_params: "wte,norm,ln"

# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: csv)
logger_name: csv

# The random seed to use for reproducibility. (type: int, default: 1337)
seed: 1337

# Optimizer-related arguments
optimizer:
  class_path: torch.optim.AdamW

  init_args:
    # (type: float, default: 0.001)
    lr: 0.0002

    # (type: float, default: 0.01)
    weight_decay: 0.0

    # (type: tuple, default: (0.9,0.999))
    betas:
      - 0.9
      - 0.95
```
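The `optimizer` block follows the `class_path`/`init_args` convention used throughout these configs. As a rough illustration of what the trainer ends up constructing, here is a sketch with a made-up helper (`instantiate_optimizer` is not litgpt's actual API, just an assumption for demonstration):

```python
from importlib import import_module

import torch
import torch.nn as nn


def instantiate_optimizer(config: dict, params) -> torch.optim.Optimizer:
    """Build an optimizer from a ``class_path``/``init_args`` mapping.

    Hypothetical helper that mirrors the YAML above; not litgpt's own parsing code.
    """
    module_name, class_name = config["class_path"].rsplit(".", 1)
    optimizer_cls = getattr(import_module(module_name), class_name)
    init_args = dict(config["init_args"])
    # YAML sequences arrive as Python lists; AdamW expects a tuple for betas.
    if "betas" in init_args:
        init_args["betas"] = tuple(init_args["betas"])
    return optimizer_cls(params, **init_args)


if __name__ == "__main__":
    optimizer_config = {
        "class_path": "torch.optim.AdamW",
        "init_args": {"lr": 0.0002, "weight_decay": 0.0, "betas": [0.9, 0.95]},
    }
    model = nn.Linear(8, 8)  # stand-in for the fine-tuned model
    optimizer = instantiate_optimizer(optimizer_config, model.parameters())
    print(optimizer)  # AdamW with lr=0.0002, betas=(0.9, 0.95), weight_decay=0.0
```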
A second new file (141 lines) adds the same configuration for Mistral 7B. It is identical to the Llama 2 config above except for the checkpoint and output directories:

```yaml
checkpoint_dir: checkpoints/mistralai/Mistral-7B-v0.1
out_dir: out/finetune/lora-mistral-7b
```
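Both configs enable the new `longlora` section. The role of `n_groups`, following the LongLoRA paper's shifted sparse attention (S²-Attn), is to split the enlarged sequence into groups and shift half of the attention heads by half a group so information still flows across group boundaries. A minimal sketch of that shift step, based on the paper rather than this PR's exact implementation:

```python
import torch


def shift_heads(qkv: torch.Tensor, n_groups: int) -> torch.Tensor:
    """Sketch of the S^2-Attn shift on a (batch, n_heads, seq_len, head_dim) tensor.

    Only the shift-and-group step is shown; attending within each group and
    un-shifting afterwards are omitted.
    """
    batch, n_heads, seq_len, head_dim = qkv.shape
    assert seq_len % n_groups == 0, "sequence length must be divisible by n_groups"
    group_size = seq_len // n_groups

    shifted = qkv.clone()
    # Roll the second half of the heads backwards by half a group, so that in
    # those heads tokens near a group boundary can attend across it.
    shifted[:, n_heads // 2:] = torch.roll(
        shifted[:, n_heads // 2:], shifts=-(group_size // 2), dims=2
    )
    # Fold the groups into the batch dimension so each group is attended to
    # independently (e.g. by scaled_dot_product_attention on the result).
    shifted = shifted.view(batch, n_heads, n_groups, group_size, head_dim)
    shifted = shifted.permute(0, 2, 1, 3, 4)  # (batch, groups, heads, group, dim)
    return shifted.reshape(batch * n_groups, n_heads, group_size, head_dim)


x = torch.randn(1, 8, 8192, 64)        # e.g. context_length=8192, 8 heads
groups = shift_heads(x, n_groups=4)    # -> shape (4, 8, 2048, 64)
```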
litgpt/args.py:

```diff
@@ -36,14 +36,6 @@ class TrainArgs:
     max_norm: Optional[float] = None
     min_lr: float = 6e-5
 
-    def __post_init__(self) -> None:
-        if self.lr_warmup_fraction and self.lr_warmup_steps:
-            raise ValueError(
-                "Can't provide both `--train.lr_warmup_fraction` and `--train.lr_warmup_steps`. Choose one."
-            )
-        if self.lr_warmup_fraction and not (0 <= self.lr_warmup_fraction <= 1):
-            raise ValueError("`--train.lr_warmup_fraction` must be between 0 and 1.")
-
     def gradient_accumulation_iters(self, devices: int) -> int:
         """Number of iterations between gradient synchronizations"""
         gradient_accumulation_iters = self.batch_size(devices) // self.micro_batch_size
```
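For the configs above (`global_batch_size: 8`, `micro_batch_size: 2`, `devices: 1`), the retained `gradient_accumulation_iters` works out to 4 micro-batches per optimizer step. A worked version of that arithmetic, assuming `batch_size(devices)` simply divides the global batch size across data-parallel ranks:

```python
# Worked example of TrainArgs.gradient_accumulation_iters with the config values.
# Assumes batch_size(devices) == global_batch_size // devices.
global_batch_size = 8   # samples between optimizer steps, across all ranks
micro_batch_size = 2    # samples per forward/backward pass on one rank
devices = 1

batch_size_per_rank = global_batch_size // devices                      # 8
gradient_accumulation_iters = batch_size_per_rank // micro_batch_size   # 4

assert gradient_accumulation_iters > 0
print(gradient_accumulation_iters)  # 4 micro-batches are accumulated per step
```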
```diff
@@ -77,3 +69,17 @@ class EvalArgs:
     """Number of iterations"""
     initial_validation: bool = False
     """Whether to evaluate on the validation set at the beginning of the training"""
+
+
+@dataclass
+class LongLoraArgs:
+    """LongLora-related arguments"""
+
+    use_longlora: bool = False
+    """Whether to enable LongLora."""
+    n_groups: int = 4
+    """Number of groups to divide the sequence length into."""
+    context_length: int = 8192
+    """Length of the enlarged context window."""
+    trainable_params: str = "wte,norm,ln"
+    """List of comma-separated parameters to train in LongLora."""
```

Review comment on `context_length`:

> I wonder here what happens if the model has a longer context already. A good test case could be LongChat (supported in LitGPT). I wonder if this should be a factor (2x the original context length) or None by default and then infer 2x the original context length.

Author reply:

> I've put a double check: one in the …
Review comment on `trainable_params`:

> What are the other options? Are …

Author reply:

> Sorry, I didn't get it. I was looking at the model and, if I'm not missing something, I think that those are the only ones left other than the LoRA layers (controlled by the arguments in the …
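For context, `trainable_params` lists the non-LoRA modules that LongLoRA additionally unfreezes (the token embedding and the normalization layers). A minimal sketch of that name-based filtering, assuming litgpt's usual parameter names (`transformer.wte`, `norm_1`/`norm_2`, `ln_f`) and not necessarily this PR's exact matching rule:

```python
import torch.nn as nn


def mark_longlora_trainable(model: nn.Module, trainable_params: str = "wte,norm,ln") -> None:
    """Unfreeze parameters whose names contain any of the given substrings.

    Sketch only: with litgpt's naming, "wte" hits the token embedding, "norm"
    the per-block norms (norm_1/norm_2), and "ln" the final ln_f, in addition
    to whatever the LoRA flags already make trainable.
    """
    keywords = [kw.strip() for kw in trainable_params.split(",") if kw.strip()]
    for name, param in model.named_parameters():
        if any(kw in name for kw in keywords):
            param.requires_grad = True
```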