Skip to content

Commit 28be04b

Browse files
authored
Make VBoost activation conditional (#14458)
1 parent 310a862 commit 28be04b

32 files changed

+304
-271
lines changed

scripts/performance/argument_parser.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,5 +393,13 @@ def list_of_strings(arg):
393393
required=False,
394394
default=None,
395395
)
396+
parser.add_argument(
397+
"-vb",
398+
"--enable_vboost",
399+
help="Enable VBoost which steers more power towards tensor cores. Disabled by default",
400+
type=bool_arg,
401+
required=False,
402+
default=None,
403+
)
396404

397405
return parser

scripts/performance/diffusion/pretrain_flux_12b.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,17 @@
1717
import nemo_run as run
1818

1919
from nemo.collections.diffusion.recipes.flux_12b import pretrain_recipe
20-
from nemo.lightning.run.plugins import NsysPlugin, PerfEnvPlugin
20+
from nemo.lightning.run.plugins import NsysPlugin
2121

2222
from ..argument_parser import parse_cli_args
2323
from ..executors import slurm_executor
24-
from ..helpers import args_sanity_check, get_user_configs, set_exp_logging_configs, set_primary_perf_configs
24+
from ..helpers import (
25+
args_sanity_check,
26+
build_perf_env_plugin,
27+
get_user_configs,
28+
set_exp_logging_configs,
29+
set_primary_perf_configs,
30+
)
2531

2632

2733
def override_recipe_configs(
@@ -106,13 +112,7 @@ def override_recipe_configs(
106112
nemo_home=args.nemo_home,
107113
)
108114

109-
plugins = [
110-
PerfEnvPlugin(
111-
enable_vboost=True,
112-
nccl_pp_comm_chunksize=2097152 if pp_size > 1 else None,
113-
gpu_sm100_or_newer=(args.gpu.lower() in ['b200', 'gb200']),
114-
),
115-
]
115+
plugins = [build_perf_env_plugin(args, pp_size=pp_size)]
116116
if args.enable_nsys:
117117
plugins.append(NsysPlugin(start_step=5, end_step=6))
118118

scripts/performance/helpers.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -483,3 +483,32 @@ def args_sanity_check(args: dict) -> None:
483483
assert args.wandb_key is not None, "wandb logger needs \"wandb_key\""
484484
assert args.wandb_prj_name is not None, "wandb logger needs \"wandb_prj_name\""
485485
assert args.wandb_job_name is not None, "wandb logger needs \"wandb_job_name\""
486+
487+
488+
def build_perf_env_plugin(args, pp_size: Optional[int] = None, user_buffer_registration: Optional[bool] = None):
    """
    Create a PerfEnvPlugin with consistent defaults across scripts.

    - enable_vboost is taken from the ``--enable_vboost`` CLI flag
      (``args.enable_vboost``); it is disabled unless explicitly enabled.
    - nccl_pp_comm_chunksize is set (2 MiB) only when pipeline parallelism is used.
    - gpu_sm100_or_newer is set when gpu is in ['b200', 'gb200'].

    Args:
        args: Parsed CLI args that include `gpu` and `enable_vboost`.
        pp_size: Pipeline parallel size used to decide the NCCL comm chunk size.
        user_buffer_registration: Optional flag to enable user buffer registration.

    Returns:
        A configured PerfEnvPlugin instance.
    """
    # Imported lazily so importing this helpers module does not require nemo.lightning.
    from nemo.lightning.run.plugins import PerfEnvPlugin

    gpu_str = getattr(args, "gpu", "").lower()
    # The CLI default for --enable_vboost is None; coerce to a definite bool so the
    # plugin always receives True/False rather than None.
    enable_vboost = bool(getattr(args, "enable_vboost", None))
    gpu_sm100_or_newer = gpu_str in ["b200", "gb200"]
    # 2097152 bytes == 2 MiB chunk size for NCCL pipeline-parallel communication.
    nccl_pp_comm_chunksize = 2097152 if (pp_size is not None and pp_size > 1) else None
    user_buf = bool(user_buffer_registration) if user_buffer_registration is not None else False

    return PerfEnvPlugin(
        enable_vboost=enable_vboost,
        nccl_pp_comm_chunksize=nccl_pp_comm_chunksize,
        gpu_sm100_or_newer=gpu_sm100_or_newer,
        user_buffer_registration=user_buf,
    )

scripts/performance/llm/finetune_deepseek_v3.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,11 @@
2222
from nemo.collections.nlp.modules.common.tokenizer_utils import get_nmt_tokenizer
2323
from nemo.lightning.pytorch.callbacks.megatron_enable_experimental_callback import MegatronEnableExperimentalCallback
2424
from nemo.lightning.pytorch.callbacks.moe_token_drop import MegatronTokenDropCallback
25-
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin
25+
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin
2626

2727
from ..argument_parser import parse_cli_args
2828
from ..executors import slurm_executor
29-
from ..helpers import args_sanity_check, get_user_configs, set_primary_perf_configs
29+
from ..helpers import args_sanity_check, build_perf_env_plugin, get_user_configs, set_primary_perf_configs
3030
from ..utils import hf_tokenizer, import_ckpt_experiment, isfile_train_pack_metadata
3131

3232
HF_MODEL_URI = "deepseek-ai/DeepSeek-V3-Base"
@@ -167,13 +167,7 @@ def override_recipe_configs(
167167
network='sharp' if args.use_sharp else None,
168168
)
169169

170-
plugins = [
171-
PerfEnvPlugin(
172-
enable_vboost=True,
173-
nccl_pp_comm_chunksize=2097152 if pp_size > 1 else None,
174-
gpu_sm100_or_newer=(args.gpu.lower() in ['b200', 'gb200']),
175-
)
176-
]
170+
plugins = [build_perf_env_plugin(args, pp_size=pp_size)]
177171
if args.enable_nsys:
178172
plugins.append(NsysPlugin(start_step=10, end_step=12, gen_shape=True))
179173
if args.enable_memory_profile:

scripts/performance/llm/finetune_llama31_405b.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,17 @@
2222
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
2323
userbuffers_fp8_h100_h16384_tp4_mbs1_seqlen2048_lora,
2424
)
25-
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin
25+
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin
2626

2727
from ..argument_parser import parse_cli_args
2828
from ..executors import slurm_executor
29-
from ..helpers import args_sanity_check, get_user_configs, set_exp_logging_configs, set_primary_perf_configs
29+
from ..helpers import (
30+
args_sanity_check,
31+
build_perf_env_plugin,
32+
get_user_configs,
33+
set_exp_logging_configs,
34+
set_primary_perf_configs,
35+
)
3036
from ..utils import (
3137
get_comm_overlap_callback_idx,
3238
hf_tokenizer,
@@ -190,13 +196,7 @@ def override_recipe_configs(
190196
network='sharp' if args.use_sharp else None,
191197
)
192198

193-
plugins = [
194-
PerfEnvPlugin(
195-
enable_vboost=True,
196-
nccl_pp_comm_chunksize=2097152 if pp_size > 1 else None,
197-
gpu_sm100_or_newer=(args.gpu.lower() in ['b200', 'gb200']),
198-
)
199-
]
199+
plugins = [build_perf_env_plugin(args, pp_size=pp_size)]
200200
if args.enable_nsys:
201201
plugins.append(NsysPlugin(start_step=5, end_step=6))
202202
if args.enable_memory_profile:

scripts/performance/llm/finetune_llama3_70b.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -22,11 +22,17 @@
2222
from nemo.collections.llm.recipes.tp_overlap_configs.userbuffers import (
2323
userbuffers_fp8_h100_h8192_tp2_mbs1_seqlen4096_lora,
2424
)
25-
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin
25+
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin
2626

2727
from ..argument_parser import parse_cli_args
2828
from ..executors import slurm_executor
29-
from ..helpers import args_sanity_check, get_user_configs, set_exp_logging_configs, set_primary_perf_configs
29+
from ..helpers import (
30+
args_sanity_check,
31+
build_perf_env_plugin,
32+
get_user_configs,
33+
set_exp_logging_configs,
34+
set_primary_perf_configs,
35+
)
3036
from ..utils import (
3137
get_comm_overlap_callback_idx,
3238
hf_tokenizer,
@@ -197,13 +203,7 @@ def override_recipe_configs(
197203
network='sharp' if args.use_sharp else None,
198204
)
199205

200-
plugins = [
201-
PerfEnvPlugin(
202-
enable_vboost=True,
203-
nccl_pp_comm_chunksize=2097152 if pp_size > 1 else None,
204-
gpu_sm100_or_newer=(args.gpu.lower() in ['b200', 'gb200']),
205-
)
206-
]
206+
plugins = [build_perf_env_plugin(args, pp_size=pp_size)]
207207
if args.enable_nsys:
208208
plugins.append(NsysPlugin(start_step=5, end_step=6))
209209
if args.enable_memory_profile:

scripts/performance/llm/finetune_llama3_8b.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,17 @@
1717
import nemo_run as run
1818

1919
from nemo.collections.llm.recipes.llama3_8b import finetune_recipe, model
20-
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin
20+
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin
2121

2222
from ..argument_parser import parse_cli_args
2323
from ..executors import slurm_executor
24-
from ..helpers import args_sanity_check, get_user_configs, set_exp_logging_configs, set_primary_perf_configs
24+
from ..helpers import (
25+
args_sanity_check,
26+
build_perf_env_plugin,
27+
get_user_configs,
28+
set_exp_logging_configs,
29+
set_primary_perf_configs,
30+
)
2531
from ..utils import hf_tokenizer, import_ckpt_experiment, prepare_squad_dataset_experiment
2632

2733
HF_MODEL_URI = "meta-llama/Meta-Llama-3-8B"
@@ -135,13 +141,7 @@ def override_recipe_configs(
135141
network='sharp' if args.use_sharp else None,
136142
)
137143

138-
plugins = [
139-
PerfEnvPlugin(
140-
enable_vboost=True,
141-
nccl_pp_comm_chunksize=2097152 if pp_size > 1 else None,
142-
gpu_sm100_or_newer=(args.gpu.lower() in ['b200', 'gb200']),
143-
)
144-
]
144+
plugins = [build_perf_env_plugin(args, pp_size=pp_size)]
145145
if args.enable_nsys:
146146
plugins.append(NsysPlugin(start_step=5, end_step=6))
147147
if args.enable_memory_profile:

scripts/performance/llm/finetune_llama4_e128.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,17 @@
1818

1919
from nemo.collections.llm.recipes.llama4_e128 import finetune_recipe, model
2020
from nemo.collections.llm.recipes.precision.mixed_precision import bf16_with_fp8_mixed
21-
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin
21+
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin
2222

2323
from ..argument_parser import parse_cli_args
24+
from ..executors import slurm_executor
25+
from ..helpers import (
26+
args_sanity_check,
27+
build_perf_env_plugin,
28+
get_user_configs,
29+
set_exp_logging_configs,
30+
set_primary_perf_configs,
31+
)
2432
from ..utils import (
2533
args_sanity_check,
2634
get_user_configs,
@@ -162,13 +170,7 @@ def override_recipe_configs(
162170
)
163171
exp_name = f"{splitext(basename(__file__))[0]}_{args.compute_dtype}_{exp_config}"
164172

165-
plugins = [
166-
PerfEnvPlugin(
167-
enable_vboost=True,
168-
nccl_pp_comm_chunksize=2097152 if pp_size > 1 else None,
169-
gpu_sm100_or_newer=(args.gpu.lower() in ['b200', 'gb200']),
170-
)
171-
]
173+
plugins = [build_perf_env_plugin(args, pp_size=pp_size)]
172174

173175
if args.enable_nsys:
174176
plugins.append(NsysPlugin(start_step=5, end_step=6))

scripts/performance/llm/mlperf_lora_llama2_70b.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,11 +24,11 @@
2424
from nemo.collections.llm.gpt.model.llama import *
2525
from nemo.lightning.pytorch.callbacks.megatron_comm_overlap import MegatronCommOverlapCallback
2626
from nemo.lightning.pytorch.optim import CosineAnnealingScheduler
27-
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin
27+
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin
2828

2929
from ..argument_parser import parse_cli_args
3030
from ..executors import slurm_executor
31-
from ..helpers import args_sanity_check
31+
from ..helpers import args_sanity_check, build_perf_env_plugin
3232
from ..utils import import_ckpt_experiment
3333

3434
NUM_NODES = 1
@@ -345,13 +345,7 @@ def mlperf_lora_llama2_70b_recipe(
345345

346346
recipe.log.wandb = wandb_logger(project=args.wandb_prj_name, name=args.wandb_job_name)
347347

348-
plugins = [
349-
PerfEnvPlugin(
350-
enable_vboost=True,
351-
nccl_pp_comm_chunksize=2097152 if PP_SIZE > 1 else None,
352-
gpu_sm100_or_newer=(args.gpu.lower() in ['b200', 'gb200']),
353-
)
354-
]
348+
plugins = [build_perf_env_plugin(args, pp_size=PP_SIZE)]
355349
if args.enable_nsys:
356350
plugins.append(NsysPlugin(start_step=5, end_step=6))
357351
if args.enable_memory_profile:

scripts/performance/llm/pretrain_automodel_llama3_8b.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,11 @@
1919
from nemo import lightning as nl
2020
from nemo.collections.llm.gpt.data.hf_dataset import HFMockDataModule
2121
from nemo.collections.llm.recipes import hf_auto_model_for_causal_lm
22-
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin, PerfEnvPlugin
22+
from nemo.lightning.run.plugins import MemoryProfilePlugin, NsysPlugin
2323

2424
from ..argument_parser import parse_cli_args
2525
from ..executors import slurm_executor
26-
from ..helpers import args_sanity_check, get_user_configs
26+
from ..helpers import args_sanity_check, build_perf_env_plugin, get_user_configs
2727

2828
SEQ_LENGTH = 2048
2929
NUM_GPUS_PER_NODE = 8
@@ -105,13 +105,7 @@ def override_recipe_configs(
105105
network='sharp' if args.use_sharp else None,
106106
)
107107

108-
plugins = [
109-
PerfEnvPlugin(
110-
enable_vboost=True,
111-
nccl_pp_comm_chunksize=2097152 if pp_size > 1 else None,
112-
gpu_sm100_or_newer=(args.gpu.lower() in ['b200', 'gb200']),
113-
),
114-
]
108+
plugins = [build_perf_env_plugin(args, pp_size=pp_size)]
115109
if args.enable_nsys:
116110
plugins.append(NsysPlugin(start_step=5, end_step=6))
117111
if args.enable_memory_profile:

0 commit comments

Comments
 (0)