
Commit 507bfb7

feat: add more PEFT lora recipes (#959)
Signed-off-by: Zhiyu Li <[email protected]>
1 parent 1645e61 commit 507bfb7

File tree: 6 files changed, +514 -2 lines changed

docs/performance-summary.md

Lines changed: 9 additions & 2 deletions
@@ -25,9 +25,12 @@ The table below shows finetuning (LoRA) performance for full sequences with no p
 | Model | #GPUs | GBS | MBS | LBS | GA | Seq Length | TP | PP | CP | EP | VP | FSDP | Kernel Optimizations | Time per Global Step (s) | Model TFLOPs/sec/GPU | Tokens/sec/GPU |
 |-------|------:|----:|----:|----:|---:|-----------:|---:|---:|---:|---:|---:|-----:|---------|-------------------------:|---------------------:|---------------:|
+| Llama3 8B | 1 | 32 | 2 | 2 | 16 | 4096 | 1 | 1 | 1 | - | 1 | 1 | - | 10.51 | 402 | 12472.87 |
+| Qwen2.5 7B | 1 | 32 | 2 | 2 | 16 | 4096 | 1 | 1 | 1 | - | 1 | 1 | - | 9.29 | 423 | 14110.05 |
 | Llama3 70B | 8 | 32 | 1 | 4 | 4 | 4096 | 2 | 4 | 1 | - | 10 | 1 | - | 26.92 | 176 | 608.42 |
-| Qwen2.5 32B | 8 | 32 | 1 | 8 | 2 | 4096 | 1 | 4 | 1 | - | 8 | 1 | - | 8.40 | 261 | 1950.93 |
-
+| Qwen2.5 32B | 8 | 32 | 1 | 8 | 2 | 4096 | 1 | 4 | 1 | - | 8 | 1 | 2 | 8.40 | 261 | 1950.93 |
+| Llama3 70B 2-node | 16 | 32 | 1 | 4 | 2 | 4096 | 2 | 4 | 1 | - | 10 | 1 | 2 | 12.78 | 185 | 640.95 |
+| Qwen2.5 32B 2-node | 16 | 32 | 1 | 8 | 1 | 4096 | 1 | 4 | 1 | - | 8 | 1 | 4 | 4.48 | 244 | 1826.49 |
 ## Glossary
 
 - **MFU**: Model FLOPs Utilization - ratio of achieved compute to peak hardware capability
@@ -55,8 +58,12 @@ All benchmark configurations are available in [`examples/benchmark/configs/`](ht
 - [`qwen3_moe_30b_te_deepep.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/benchmark/configs/qwen3_moe_30b_te_deepep.yaml) - Qwen3 MoE with TE + DeepEP
 - [`gptoss_20b_te_deepep.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/benchmark/configs/gptoss_20b_te_deepep.yaml) - GPT-OSS 20B with optimizations
 - [`gptoss_120b_te_deepep.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/benchmark/configs/gptoss_120b_te_deepep.yaml) - GPT-OSS 120B optimized
+- [`Llama_8b_lora.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_finetune/llama3_1/llama3_1_8b_peft_benchmark.yaml) - Llama-8B Finetuning (LoRA) optimized
+- [`Qwen2_5_7b_lora.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_finetune/qwen/qwen2_5_7b_peft_benchmark.yaml) - Qwen2.5-7B Finetuning (LoRA) optimized
 - [`Llama_70b_lora.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_finetune/llama3_3/custom_llama3_3_70b_instruct_peft_benchmark.yaml) - Llama-70B Finetuning (LoRA) optimized
 - [`Qwen2_5_32b_lora.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_finetune/qwen/qwen2_5_32b_peft_benchmark.yaml) - Qwen2.5-32B Finetuning (LoRA) optimized
+- [`Llama_70b_lora_2nodes.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_finetune/llama3_3/custom_llama3_3_70b_instruct_peft_benchmark_2nodes.yaml) - Llama-70B Finetuning (LoRA) optimized on 2 nodes
+- [`Qwen2_5_32b_lora_2nodes.yaml`](https://github.com/NVIDIA-NeMo/Automodel/tree/main/examples/llm_finetune/qwen/qwen2_5_32b_peft_benchmark_2nodes.yaml) - Qwen2.5-32B Finetuning (LoRA) optimized on 2 nodes
 
 ---
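Note: the throughput columns in the table above can be cross-checked from the batch and timing columns. A minimal sketch of the arithmetic, assuming the usual definitions of tokens/sec/GPU and MFU rather than the exact formulas in the benchmark harness:

```python
# Sketch: how the table's throughput columns relate, using the Llama3 8B LoRA row.
# Assumes the standard definitions; not extracted from the benchmark code itself.

global_batch_size = 32          # GBS column
seq_length = 4096               # Seq Length column
num_gpus = 1                    # #GPUs column
time_per_global_step = 10.51    # Time per Global Step (s) column
model_tflops_per_gpu = 402      # Model TFLOPs/sec/GPU column
peak_tflops = 989               # H100 peak, as used by `benchmark.peak_tflops` in the recipes

tokens_per_sec_per_gpu = global_batch_size * seq_length / (time_per_global_step * num_gpus)
mfu = model_tflops_per_gpu / peak_tflops

print(f"tokens/sec/GPU ~ {tokens_per_sec_per_gpu:.0f}")  # ~12471, close to the table's 12472.87
print(f"MFU ~ {mfu:.2f}")                                # ~0.41 of H100 peak
```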
examples/llm_finetune/llama3_1/llama3_1_8b_peft_benchmark.yaml

Lines changed: 105 additions & 0 deletions
@@ -0,0 +1,105 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# LoRA benchmark configuration for Llama-3.1-8B
# Uses LoRA adapters on all linear layers with a synthetic (mock) dataset
#
# To run this recipe, please use the following command:
# torchrun --nproc-per-node=1 nemo_automodel/recipes/llm/benchmark.py --config examples/llm_finetune/llama3_1/llama3_1_8b_peft_benchmark.yaml

seed: 42

# NEW: Add benchmark section
benchmark:
  warmup_steps: 5
  peak_tflops: 989  # H100: 989, A100: 312
  nsys_start: -1    # Set to step number to profile (e.g., 10)
  nsys_end: -1      # Set to end step (e.g., 15)
  nsys_ranks: []    # e.g., [0] to profile rank 0
  num_nodes: 1

step_scheduler:
  global_batch_size: 32
  local_batch_size: 2
  ckpt_every_steps: 50
  val_every_steps: 1000
  max_steps: 10

dist_env:
  backend: nccl
  timeout_minutes: 1

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 42
  ranked: true

model:
  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
  pretrained_model_name_or_path: meta-llama/Llama-3.1-8B

peft:
  _target_: nemo_automodel.components._peft.lora.PeftConfig
  match_all_linear: true
  dim: 16
  alpha: 32
  dropout: 0.1
  use_triton: true

distributed:
  _target_: nemo_automodel.components.distributed.fsdp2.FSDP2Manager
  dp_size: none
  dp_replicate_size: 1
  tp_size: 1
  cp_size: 1
  sequence_parallel: false

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

# Use MockIterableDataset for benchmarking (faster, no I/O)
dataset:
  _target_: nemo_automodel.components.datasets.llm.mock_iterable_dataset.MockIterableDataset
  vocab_size: 100
  seq_len: 4096
  num_samples: 1000000
  batch_size: 2

dataloader:
  _target_: torch.utils.data.DataLoader
  batch_size: null  # Dataset already yields batches
  # Note: model_config will be auto-injected by train_ft.py for PP models

# validation_dataset:
#   _target_: nemo_automodel.components.datasets.llm.column_mapped_text_instruction_dataset.ColumnMappedTextInstructionDataset
#   path_or_dataset_id: Muennighoff/natural-instructions
#   split: validation
#   column_mapping:
#     instruction: definition
#     question: inputs
#     answer: targets

optimizer:
  _target_: torch.optim.AdamW
  betas: [0.9, 0.999]
  eps: 1e-8
  lr: 1.0e-5
  weight_decay: 0.01

# Uncomment and configure for W&B logging
# wandb:
#   project: <your_wandb_project>
#   entity: <your_wandb_entity>
#   name: llama3_1_8b_squad_qlora
#   save_dir: <your_wandb_save_dir>
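Note: the recipe above pins only the global and local batch sizes; the gradient-accumulation (GA) value reported in the performance table follows from them and the data-parallel size. A small illustrative sketch, assuming the standard GBS = DP x LBS x GA relationship (the helper below is hypothetical, not part of nemo_automodel):

```python
# Hypothetical helper illustrating how GBS, LBS, DP and gradient accumulation (GA)
# relate for the single-GPU Llama-3.1-8B recipe above. Not part of nemo_automodel.

def grad_accum_steps(global_batch_size: int, local_batch_size: int, dp_size: int) -> int:
    # GBS must be evenly divisible across data-parallel ranks and accumulation steps.
    assert global_batch_size % (local_batch_size * dp_size) == 0
    return global_batch_size // (local_batch_size * dp_size)

# 1 GPU, GBS 32, LBS 2 -> GA 16, matching the GA column in the performance table.
print(grad_accum_steps(global_batch_size=32, local_batch_size=2, dp_size=1))  # 16
```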
examples/llm_finetune/llama3_3/custom_llama3_3_70b_instruct_peft_benchmark_2nodes.yaml

Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
# Based on your existing config, modified for benchmarking

seed: 42

# NEW: Add benchmark section
benchmark:
  warmup_steps: 5
  peak_tflops: 989  # H100: 989, A100: 312
  nsys_start: -1    # Set to step number to profile (e.g., 10)
  nsys_end: -1      # Set to end step (e.g., 15)
  nsys_ranks: []    # e.g., [0] to profile rank 0
  num_nodes: 2

step_scheduler:
  global_batch_size: 32
  local_batch_size: 8
  ckpt_every_steps: 50
  val_every_steps: 1000
  max_steps: 10

dist_env:
  backend: nccl
  timeout_minutes: 1

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 1111
  ranked: true

model:
  _target_: nemo_automodel.components.models.llama.model.build_llama_model
  pretrained_model_name_or_path: meta-llama/Llama-3.3-70B-Instruct
  torch_dtype: bf16

peft:
  _target_: nemo_automodel.components._peft.lora.PeftConfig
  match_all_linear: True
  dim: 16
  alpha: 32
  use_triton: True

checkpoint:
  enabled: false
  checkpoint_dir: checkpoints/
  model_save_format: safetensors
  save_consolidated: false

distributed:
  _target_: nemo_automodel.components.distributed.fsdp2.FSDP2Manager
  dp_size: 2
  tp_size: 2
  cp_size: 1
  pp_size: 4
  sequence_parallel: false
  activation_checkpointing: true

autopipeline:
  _target_: nemo_automodel.components.distributed.pipelining.autopipeline.AutoPipeline
  pp_schedule: interleaved1f1b
  pp_microbatch_size: 1
  layers_per_stage: 2
  scale_grads_in_schedule: false
  round_virtual_stages_to_pp_multiple: up
  dtype: bf16

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

# Use MockIterableDataset for benchmarking (faster, no I/O)
dataset:
  _target_: nemo_automodel.components.datasets.llm.mock_iterable_dataset.MockIterableDataset
  vocab_size: 100
  seq_len: 4096
  num_samples: 1000000

dataloader:
  _target_: torch.utils.data.DataLoader
  batch_size: null  # Dataset already yields batches
  # Note: model_config will be auto-injected by train_ft.py for PP models

optimizer:
  _target_: torch.optim.Adam
  betas: [0.9, 0.999]
  eps: 1e-8
  lr: 1.0e-5
  weight_decay: 0

lr_scheduler:
  lr_decay_style: cosine
  min_lr: 1.0e-6
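Note: for the 2-node recipe above, the world size must equal the product of the parallelism degrees in the distributed block. A quick sanity check, assuming 8 GPUs per node (an assumption; the recipe itself only records num_nodes):

```python
# Sanity check of the 2-node Llama-3.3-70B parallel layout above.
# Assumes 8 GPUs per node; not part of the recipe code.

num_nodes = 2        # benchmark.num_nodes
gpus_per_node = 8    # assumption for typical H100 nodes
dp_size, tp_size, cp_size, pp_size = 2, 2, 1, 4  # distributed block above

world_size = num_nodes * gpus_per_node
assert world_size == dp_size * tp_size * cp_size * pp_size  # 16 == 16
print(f"world size {world_size} = dp {dp_size} x tp {tp_size} x cp {cp_size} x pp {pp_size}")
```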
examples/llm_finetune/qwen/qwen2_5_32b_peft_benchmark_2nodes.yaml

Lines changed: 92 additions & 0 deletions
@@ -0,0 +1,92 @@
# Custom Qwen2.5-32B with combined projections for benchmarking
# Based on Qwen2_72B_peft_benchmark.yaml but using custom model implementation
# Similar to custom_llama3_3_70b_instruct_peft_benchmark.yaml

seed: 42

# Benchmark section
benchmark:
  warmup_steps: 5
  peak_tflops: 989  # H100: 989, A100: 312
  nsys_start: -1    # Set to step number to profile (e.g., 10)
  nsys_end: -1      # Set to end step (e.g., 15)
  nsys_ranks: []    # e.g., [0] to profile rank 0
  num_nodes: 2

step_scheduler:
  global_batch_size: 32
  local_batch_size: 8
  ckpt_every_steps: 50
  val_every_steps: 1000
  max_steps: 10

dist_env:
  backend: nccl
  timeout_minutes: 1

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 1111
  ranked: true

model:
  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_pretrained
  pretrained_model_name_or_path: Qwen/Qwen2.5-32B-Instruct
  torch_dtype: bf16

peft:
  _target_: nemo_automodel.components._peft.lora.PeftConfig
  match_all_linear: True
  dim: 16
  alpha: 32
  use_triton: True

checkpoint:
  enabled: false
  checkpoint_dir: checkpoints/
  model_save_format: safetensors
  save_consolidated: false

distributed:
  _target_: nemo_automodel.components.distributed.fsdp2.FSDP2Manager
  dp_size: 4
  tp_size: 1
  cp_size: 1
  pp_size: 4
  sequence_parallel: false
  activation_checkpointing: true

autopipeline:
  _target_: nemo_automodel.components.distributed.pipelining.autopipeline.AutoPipeline
  pp_schedule: interleaved1f1b
  pp_microbatch_size: 1
  layers_per_stage: 2
  scale_grads_in_schedule: false
  round_virtual_stages_to_pp_multiple: up
  dtype: bf16

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

# Use MockIterableDataset for benchmarking (faster, no I/O)
dataset:
  _target_: nemo_automodel.components.datasets.llm.mock_iterable_dataset.MockIterableDataset
  vocab_size: 100
  seq_len: 4096
  num_samples: 1000000

dataloader:
  _target_: torch.utils.data.DataLoader
  batch_size: null  # Dataset already yields batches
  # Note: model_config will be auto-injected by train_ft.py for PP models

optimizer:
  _target_: torch.optim.Adam
  betas: [0.9, 0.999]
  eps: 1e-8
  lr: 1.0e-5
  weight_decay: 0

lr_scheduler:
  lr_decay_style: cosine
  min_lr: 1.0e-6
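Note: the VP (virtual pipeline) values in the performance table fall out of the autopipeline settings above: with layers_per_stage: 2 and pp_size: 4, each pipeline rank hosts num_layers / layers_per_stage / pp_size virtual stages. A rough check, with the layer counts (64 for Qwen2.5-32B, 80 for Llama-3.3-70B) taken as assumptions rather than from the recipes:

```python
# Rough check of the VP column in the performance table, derived from the
# autopipeline settings above. Layer counts are assumptions (Qwen2.5-32B: 64,
# Llama-3.3-70B: 80); the recipes themselves do not specify them.

def virtual_stages_per_rank(num_layers: int, layers_per_stage: int, pp_size: int) -> int:
    # Total pipeline stages = num_layers / layers_per_stage, split across pp ranks.
    return num_layers // layers_per_stage // pp_size

print(virtual_stages_per_rank(num_layers=64, layers_per_stage=2, pp_size=4))  # 8, Qwen2.5 32B rows
print(virtual_stages_per_rank(num_layers=80, layers_per_stage=2, pp_size=4))  # 10, Llama3 70B rows
```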
