Skip to content

Commit 83b6ac4

Browse files
Merge remote-tracking branch 'upstream/main'
2 parents 38cf6ab + 336803f commit 83b6ac4

21 files changed

+265
-103
lines changed

examples/configs/distillation_math.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,12 @@ policy: &POLICY_BASE
155155
use_custom_fsdp: false
156156
data_parallel_sharding_strategy: "optim_grads_params"
157157

158+
fp8_cfg:
159+
enabled: false
160+
fp8: "e4m3"
161+
fp8_recipe: "blockwise"
162+
fp8_param: false
163+
158164
scheduler:
159165
- name: "torch.optim.lr_scheduler.LinearLR"
160166
kwargs:

examples/configs/distillation_math_megatron.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,12 @@ policy: &POLICY_BASE
106106
use_custom_fsdp: false
107107
data_parallel_sharding_strategy: "optim_grads_params"
108108

109+
fp8_cfg:
110+
enabled: false
111+
fp8: "e4m3"
112+
fp8_recipe: "blockwise"
113+
fp8_param: false
114+
109115
generation:
110116
backend: "vllm"
111117
max_new_tokens: ${..max_total_sequence_length} # refer to local policy/teacher config

examples/configs/dpo.yaml

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -177,7 +177,13 @@ policy:
177177
overlap_param_gather: true
178178
data_parallel_sharding_strategy: "optim_grads_params"
179179
use_custom_fsdp: false
180-
180+
181+
fp8_cfg:
182+
enabled: false
183+
fp8: "e4m3"
184+
fp8_recipe: "blockwise"
185+
fp8_param: false
186+
181187
data:
182188
max_input_seq_length: ${policy.max_total_sequence_length}
183189
shuffle: true

examples/configs/grpo_math_1B.yaml

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -189,7 +189,11 @@ policy:
189189
use_custom_fsdp: false
190190
data_parallel_sharding_strategy: "optim_grads_params"
191191

192-
fp8_cfg: null
192+
fp8_cfg:
193+
enabled: false
194+
fp8: "e4m3"
195+
fp8_recipe: "blockwise"
196+
fp8_param: false
193197

194198
env_vars: null
195199

examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,6 @@ policy:
3333
lr_warmup_init: 5.0e-08
3434
fp8_cfg:
3535
enabled: true
36-
fp8: e4m3
37-
fp8_recipe: blockwise
38-
fp8_param: false
3936
env_vars:
4037
NVTE_FP8_BLOCK_SCALING_FP32_SCALES: '1'
4138
generation:

examples/configs/recipes/llm/grpo-moonlight-16ba3b-4n8g-megatron-fp8-e2e.yaml

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,6 @@ policy:
2828
apply_rope_fusion: false
2929
fp8_cfg:
3030
enabled: true
31-
fp8: e4m3
32-
fp8_recipe: blockwise
33-
fp8_param: false
3431
optimizer:
3532
lr: 1.0e-06
3633
use_precision_aware_optimizer: false
@@ -43,10 +40,9 @@ policy:
4340
precision: fp8
4441
use_deep_gemm: true
4542
gpu_memory_utilization: 0.5
46-
quantization_ignored_layer_kws: [
47-
a_proj,
48-
b_proj
49-
]
43+
quantization_ignored_layer_kws:
44+
- a_proj
45+
- b_proj
5046
logger:
5147
monitor_gpus: false
5248
wandb:

examples/configs/recipes/llm/performance/grpo-llama3.1-8b-instruct-2n8g-fp8-async-1off.yaml

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,6 @@ policy:
55
megatron_cfg:
66
fp8_cfg:
77
enabled: true
8-
fp8: "e4m3"
9-
fp8_recipe: "blockwise"
10-
fp8_param: false
118
env_vars:
129
NVTE_FP8_BLOCK_SCALING_FP32_SCALES: "1"
1310
generation:

examples/configs/rm.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,11 @@ policy:
128128
overlap_param_gather: false
129129
data_parallel_sharding_strategy: "optim_grads_params"
130130

131+
fp8_cfg:
132+
enabled: false
133+
fp8: "e4m3"
134+
fp8_recipe: "blockwise"
135+
fp8_param: false
131136

132137
data:
133138
max_input_seq_length: ${policy.max_total_sequence_length}

examples/configs/sft.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,12 @@ policy:
175175
data_parallel_sharding_strategy: "optim_grads_params"
176176
use_custom_fsdp: false
177177

178+
fp8_cfg:
179+
enabled: false
180+
fp8: "e4m3"
181+
fp8_recipe: "blockwise"
182+
fp8_param: false
183+
178184
data:
179185
max_input_seq_length: ${policy.max_total_sequence_length}
180186
add_bos: true

examples/configs/sft_openmathinstruct2_megatron.yaml

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -100,14 +100,11 @@ policy:
100100
env_vars:
101101
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:False"
102102

103-
## fp8 training currently not supported
104-
#fp8_cfg:
105-
# enabled: true
106-
# fp8: hybrid
107-
# fp8_recipe: delayed
108-
# fp8_param: true # false gives the following error: "RuntimeError: /TransformerEngine/transformer_engine/common/gemm/cublaslt_gemm.cu:116 in function CanonicalizeGemmInput: Assertion failed: !is_fp8_dtype(ret.Atype). Input A is missing column-wise usage"
109-
# fp8_dot_product_attention: false #true
110-
# fp8_multi_head_attention: false #true
103+
fp8_cfg:
104+
enabled: false
105+
fp8: "e4m3"
106+
fp8_recipe: "blockwise"
107+
fp8_param: false
111108

112109
dynamic_batching:
113110
enabled: false

0 commit comments

Comments (0)