
Commit 5bf56a9

yfw, guyueh1, chtruong814, pjin-nvidia, and thomasdhc authored
chore: Bump vllm to 0.11.2, torch to 2.9, transformers to 4.57.1 (#1563)
Signed-off-by: Yi-Fu Wu <yifu.wu@gmail.com>
Signed-off-by: Charlie Truong <chtruong@nvidia.com>
Signed-off-by: Peter Jin <pjin@nvidia.com>
Signed-off-by: Dong Hyuk Chang <donghyukc@nvidia.com>
Co-authored-by: Guyue Huang <guyueh@nvidia.com>
Co-authored-by: Charlie Truong <chtruong@nvidia.com>
Co-authored-by: Peter Jin <pjin@nvidia.com>
Co-authored-by: Dong Hyuk Chang <donghyukc@nvidia.com>
1 parent 441f745 commit 5bf56a9

File tree: 15 files changed, +764 −454 lines


.github/workflows/cicd-main.yml

Lines changed: 1 addition & 1 deletion

@@ -208,7 +208,7 @@ jobs:
         build-contexts: |
           nemo-rl=${{ github.run_id }}/
         build-args: |
-          MAX_JOBS=32
+          MAX_JOBS=4
           NEMO_RL_COMMIT=${{ github.sha }}

   cicd-doc-tests:

.gitmodules

Lines changed: 1 addition & 1 deletion

@@ -11,7 +11,7 @@
 [submodule "3rdparty/Automodel-workspace/Automodel"]
     path = 3rdparty/Automodel-workspace/Automodel
     url = https://github.com/NVIDIA-NeMo/Automodel.git
-    branch = nemo-rl-submodule
+    branch = yifu/bump-torch-and-hf
     shallow = true
 [submodule "3rdparty/Gym-workspace/Gym"]
     path = 3rdparty/Gym-workspace/Gym

examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-e2e.yaml renamed to examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e.yaml

Lines changed: 4 additions & 3 deletions

@@ -6,7 +6,7 @@ grpo:
   loss_fn:
     use_importance_sampling_correction: true
 checkpointing:
-  checkpoint_dir: results/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-e2e
+  checkpoint_dir: results/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e
 policy:
   model_name: meta-llama/Llama-3.1-8B-Instruct
   tokenizer:
@@ -48,11 +48,12 @@ policy:
 data:
   max_input_seq_length: 4096
 logger:
-  log_dir: logs/grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-e2e
+  log_dir: logs/grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e
   wandb_enabled: true
   tensorboard_enabled: true
   wandb:
     project: nemo-rl
-    name: grpo-llama3.1-8b-instruct-1n8g-megatron-fp8-e2e
+    name: grpo-llama3.1-8b-instruct-2n8g-megatron-fp8-e2e
 cluster:
+  num_nodes: 2
   gpus_per_node: 8

nemo_rl/models/generation/fp8.py

Lines changed: 35 additions & 2 deletions

@@ -272,6 +272,7 @@ def init_fp8(vllm_cfg, model_name, model_parallel_size):

     if vllm_cfg.get("use_deep_gemm", False):
         os.environ["VLLM_USE_DEEP_GEMM"] = "1"
+        os.environ["VLLM_USE_DEEP_GEMM_E8M0"] = "0"

     if vllm_cfg["async_engine"]:
         # for async engine, vllm spawns a process for each DP, so we patch
@@ -541,14 +542,46 @@ def cast_tensor_to_fp8_blockwise(
     return fp_data, descale_fp


+# Ref: https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/vllm/model_executor/layers/quantization/utils/fp8_utils.py#L1175
+# Patches this method to not create new torch.nn.Parameter for layer weights
+# to maintain weight loaders.
+def maybe_post_process_fp8_weight_block(layer: torch.nn.Module):
+    assert layer.weight_block_size is not None
+
+    from vllm.model_executor.layers.quantization.utils.fp8_utils import (
+        deepgemm_post_process_fp8_weight_block,
+    )
+    from vllm.utils.deep_gemm import (
+        is_deep_gemm_e8m0_used,
+        should_use_deepgemm_for_fp8_linear,
+    )
+
+    # On Blackwell or Hopper, if E8M0 for DeepGemm is used, we need to
+    # requantize the weight and input to the specific scale
+    # at the same time.
+    should_use_deepgemm = should_use_deepgemm_for_fp8_linear(
+        layer.orig_dtype, layer.weight
+    )
+    if should_use_deepgemm:
+        dg_weight, dg_weight_scale = deepgemm_post_process_fp8_weight_block(
+            wq=layer.weight.data,
+            ws=layer.weight_scale.data,
+            quant_block_shape=tuple(layer.weight_block_size),
+            use_e8m0=is_deep_gemm_e8m0_used(),
+        )
+        # This is the only part we change from the original function (https://github.com/vllm-project/vllm/blob/275de34170654274616082721348b7edd9741d32/vllm/model_executor/layers/quantization/utils/fp8_utils.py#L1196-L1197)
+        # Instead of creating new torch.nn.Parameter, we update the data in place.
+        layer.weight.data.copy_(dg_weight)
+        layer.weight_scale.data.copy_(dg_weight_scale)
+
+
 def process_weights_after_loading(self, layer) -> None:
     """This function is used to process the weights after loading for a Linear layer.

     Compared to the original process_weights_after_loading in vllm, we just avoid creation of
     new torch.nn.Parameter objects, because that removes the weight_loader attribute which we need for refit.
     """
     from vllm.model_executor.layers.quantization.utils.fp8_utils import (
-        maybe_post_process_fp8_weight_block,
         process_fp8_weight_block_strategy,
     )

@@ -566,7 +599,7 @@ def process_weights_after_loading(self, layer) -> None:
     layer.weight_scale = torch.nn.Parameter(weight_scale.data, requires_grad=False)
     layer.update_param_tp_status()

-    maybe_post_process_fp8_weight_block(layer, self.cutlass_block_fp8_supported)
+    maybe_post_process_fp8_weight_block(layer)


 def process_weights_after_loading_moe(self, layer) -> None:
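
The in-place copy_ calls above are the whole point of the patched helper: as the docstring notes, rebuilding torch.nn.Parameter objects would drop attributes such as the weight_loader hook needed for refit. A minimal standalone sketch of that behavior (not NeMo RL code; the weight_loader lambda is a hypothetical stand-in for vLLM's loader hook):

    import torch

    layer = torch.nn.Linear(4, 4)
    # Hypothetical loader hook attached to the existing Parameter object.
    layer.weight.weight_loader = lambda param, w: param.data.copy_(w)

    new_data = torch.randn_like(layer.weight)

    # Re-creating the Parameter produces a fresh object without the hook.
    rebuilt = torch.nn.Parameter(new_data, requires_grad=False)
    assert not hasattr(rebuilt, "weight_loader")

    # Copying in place keeps the original Parameter (and its attributes) intact.
    layer.weight.data.copy_(new_data)
    assert hasattr(layer.weight, "weight_loader")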

nemo_rl/models/generation/vllm/vllm_worker.py

Lines changed: 116 additions & 44 deletions

@@ -16,6 +16,7 @@
 import gc
 import os
 import sys
+from importlib.util import find_spec
 from typing import Any, Optional, cast

 import ray
@@ -157,63 +158,134 @@ def __init__(
         self.rank = 0
         self.world_size = 1

-        # Monkey patch for vLLM to ensure RAY_ADDRESS is set in Ray actors.
-        try:
-            from vllm.logger import init_logger
+        # Monkey patches for vLLM behavior. We avoid importing vllm modules
+        # here to prevent side effects during initialization and instead
+        # locate the files via importlib metadata.

-            logger = init_logger("vllm_patch")
+        from vllm.logger import init_logger

-            def _patch_vllm_init_workers_ray():
-                """Patch the vLLM ray_distributed_executor.py file.
+        logger = init_logger("vllm_patch")

-                1. Pass custom runtime_env in _init_workers_ray call.
-                   - This allows passing custom py_executable to worker initialization.
-                2. Add NCCL_CUMEM_ENABLE and NCCL_NVLS_ENABLE to vLLM ADDITIONAL_ENV_VARS.
-                   - This is a workaround to fix async vllm in some scenarios.
-                   - See https://github.com/NVIDIA-NeMo/RL/pull/898 for more details.
-                """
-                try:
-                    import vllm.executor.ray_distributed_executor as ray_executor_module
+        def _get_vllm_file(relative_path: str) -> str:
+            """Return absolute path to a vLLM file or raise if it cannot be found.
+
+            The relative_path should be a POSIX-style path under the vllm
+            package root, e.g. "v1/executor/ray_executor.py" or
+            "attention/layer.py".
+            """
+            spec = find_spec("vllm")
+            if spec is None or not spec.submodule_search_locations:
+                raise RuntimeError(
+                    "vLLM package not found while attempting to patch "
+                    f"'{relative_path}'. Ensure vLLM is installed and "
+                    "available in this environment."
+                )

-                    file_to_patch = ray_executor_module.__file__
+            base_dir = next(iter(spec.submodule_search_locations))
+            file_path = os.path.join(base_dir, *relative_path.split("/"))

-                    with open(file_to_patch, "r") as f:
-                        content = f.read()
+            if not os.path.exists(file_path):
+                raise RuntimeError(
+                    "Failed to locate expected vLLM file to patch. "
+                    f"Looked for '{relative_path}' at '{file_path}'. "
+                    "This likely indicates an unexpected vLLM installation "
+                    "layout or version mismatch."
+                )
+
+            return file_path
+
+        def _patch_vllm_init_workers_ray():
+            """Patch the vLLM ray_distributed_executor.py file.
+
+            1. Pass custom runtime_env in _init_workers_ray call.
+               - This allows passing custom py_executable to worker initialization.
+            2. Add NCCL_CUMEM_ENABLE and NCCL_NVLS_ENABLE to vLLM ADDITIONAL_ENV_VARS.
+               - This is a workaround to fix async vllm in some scenarios.
+               - See https://github.com/NVIDIA-NeMo/RL/pull/898 for more details.
+            """
+            file_to_patch = _get_vllm_file("v1/executor/ray_executor.py")

-                    old_lines = [
-                        "self._init_workers_ray(placement_group)",
-                        'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"}',
-                    ]
+            with open(file_to_patch, "r") as f:
+                content = f.read()

-                    new_lines = [
-                        f'self._init_workers_ray(placement_group, runtime_env={{"py_executable": "{self.py_executable}"}})',
-                        'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "NCCL_CUMEM_ENABLE", "NCCL_NVLS_ENABLE", "RAY_ENABLE_UV_RUN_RUNTIME_ENV"}',
-                    ]
+            old_lines = [
+                "self._init_workers_ray(placement_group)",
+                'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN"}',
+            ]

-                    need_replace = False
-                    for old_line, new_line in zip(old_lines, new_lines):
-                        if new_line in content or old_line not in content:
-                            continue
-                        content = content.replace(old_line, new_line)
-                        need_replace = True
+            new_lines = [
+                f'self._init_workers_ray(placement_group, runtime_env={{"py_executable": "{self.py_executable}"}})',
+                'ADDITIONAL_ENV_VARS = {"HF_TOKEN", "HUGGING_FACE_HUB_TOKEN", "NCCL_CUMEM_ENABLE", "NCCL_NVLS_ENABLE", "RAY_ENABLE_UV_RUN_RUNTIME_ENV"}',
+            ]
+
+            need_replace = False
+            for old_line, new_line in zip(old_lines, new_lines):
+                if new_line in content or old_line not in content:
+                    continue
+                content = content.replace(old_line, new_line)
+                need_replace = True
+
+            if not need_replace:
+                return
+
+            # Write back the patched content
+            with open(file_to_patch, "w") as f:
+                f.write(content)
+
+        def _patch_vllm_vit_flash_attn_backend():
+            """Patch vLLM vision attention backend selection logic.
+
+            Modify the CUDA branch of maybe_get_vit_flash_attn_backend in
+            vllm.attention.layer to avoid overriding the backend when it
+            is already set to XFORMERS. This avoids flash attention related
+            errors when the ViT head dimension is not a multiple of 32.
+
+            Related issues:
+            - https://github.com/vllm-project/vllm/issues/27562
+            - https://github.com/vllm-project/vllm/issues/26989
+
+            This is properly fixed in https://github.com/vllm-project/vllm/pull/28763. We can remove this patch once we upgrade to a version of vllm that contains this fix.
+            """
+            file_to_patch = _get_vllm_file("attention/layer.py")
+            with open(file_to_patch, "r") as f:
+                content = f.read()
+
+            old_snippet = (
+                "    elif current_platform.is_cuda():\n"
+                "        if (\n"
+                "            attn_backend != AttentionBackendEnum.FLASH_ATTN\n"
+                "            and check_upstream_fa_availability(torch.get_default_dtype())\n"
+                "        ):\n"
+                "            attn_backend = AttentionBackendEnum.FLASH_ATTN\n"
+                "            use_upstream_fa = True\n"
+            )
+
+            new_snippet = (
+                "    elif current_platform.is_cuda():\n"
+                "        if (\n"
+                "            attn_backend != AttentionBackendEnum.FLASH_ATTN\n"
+                "            and attn_backend != AttentionBackendEnum.XFORMERS\n"
+                "            and check_upstream_fa_availability(torch.get_default_dtype())\n"
+                "        ):\n"
+                "            attn_backend = AttentionBackendEnum.FLASH_ATTN\n"
+                "            use_upstream_fa = True\n"
+            )

-                    if not need_replace:
-                        return
+            # Only patch if the file still has the old snippet and
+            # hasn't been patched already.
+            if new_snippet in content or old_snippet not in content:
+                return

-                    # Write back the patched content
-                    with open(file_to_patch, "w") as f:
-                        f.write(content)
+            content = content.replace(old_snippet, new_snippet)

-                except (ImportError, FileNotFoundError, PermissionError):
-                    # Allow failures gracefully
-                    pass
+            with open(file_to_patch, "w") as f:
+                f.write(content)

-            _patch_vllm_init_workers_ray()
-            logger.info("Successfully patched vllm _init_workers_ray.")
+        _patch_vllm_init_workers_ray()
+        logger.info("Successfully patched vllm _init_workers_ray.")

-        except (ImportError, AttributeError):
-            # vllm not installed or has a different structure, skipping patch.
-            pass
+        _patch_vllm_vit_flash_attn_backend()
+        logger.info("Successfully patched vllm vit flash attention backend.")

         try:
             import vllm
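
The new _get_vllm_file helper above resolves the files to patch from the installed package's metadata rather than importing vLLM submodules first. A reduced sketch of the same lookup pattern, written as a generic helper for illustration (the function name and arguments here are not from the repository):

    import os
    from importlib.util import find_spec

    def locate_package_file(package: str, relative_path: str) -> str:
        """Resolve a source file inside an installed package without importing it."""
        # find_spec on a top-level package reads import metadata only,
        # so the package's __init__ side effects are not triggered.
        spec = find_spec(package)
        if spec is None or not spec.submodule_search_locations:
            raise RuntimeError(f"{package} is not installed in this environment")
        base_dir = next(iter(spec.submodule_search_locations))
        file_path = os.path.join(base_dir, *relative_path.split("/"))
        if not os.path.exists(file_path):
            raise RuntimeError(f"{relative_path} not found under {base_dir}")
        return file_path

    # Example (assuming vLLM is installed):
    #   locate_package_file("vllm", "attention/layer.py")

The text patches themselves stay idempotent by checking that the new snippet is absent and the old one present before writing, so re-running a worker against an already patched installation is a no-op.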

nemo_rl/models/generation/vllm/vllm_worker_async.py

Lines changed: 0 additions & 6 deletions

@@ -471,9 +471,6 @@ class NeMoRLOpenAIServingChat(NeMoRLOpenAIServingMixin, OpenAIServingChat):
                 return_tokens_as_token_ids=True,
             )
         )
-        # Remove this fork when https://github.com/NVIDIA-NeMo/RL/pull/1563 is merged to NeMo RL main bumping to vLLM 0.11.2
-        if vllm_version < "0.11.1":
-            serving_chat_kwargs["model_config"] = model_config
         openai_serving_chat = NeMoRLOpenAIServingChat(**serving_chat_kwargs)

         generation_config = self.cfg
@@ -538,9 +535,6 @@ class NeMoRLOpenAIServingTokenization(
             engine_client=serving_chat_kwargs["engine_client"],
             models=serving_chat_kwargs["models"],
         )
-        # Remove this fork when https://github.com/NVIDIA-NeMo/RL/pull/1563 is merged to NeMo RL main bumping to vLLM 0.11.2
-        if vllm_version < "0.11.1":
-            serving_tokenization_kwargs["model_config"] = model_config
         openai_serving_tokenization = NeMoRLOpenAIServingTokenization(
             **serving_tokenization_kwargs
         )

nemo_rl/models/policy/workers/dtensor_policy_worker_v2.py

Lines changed: 1 addition & 1 deletion

@@ -1848,7 +1848,7 @@ def move_buffer_to_device(
 ) -> nn.Module:
     # FSDP modules do not move buffers to the device automatically
     for v in model.buffers():
-        v.data = v.data.to(device)
+        v = v.to(device)

     return model

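
For context on the hunk above: the comment notes that FSDP modules do not move buffers to the target device automatically, so buffer placement has to be handled explicitly. A hedged standalone sketch of one way to re-register module buffers on a device (an illustration only, not the repository's implementation):

    import torch
    import torch.nn as nn

    def move_buffers_to_device(model: nn.Module, device: torch.device) -> nn.Module:
        # setattr on a registered buffer name updates the owning module's
        # buffer registry, so the module itself sees the moved tensor.
        for name, buf in model.named_buffers():
            module_path, _, buf_name = name.rpartition(".")
            owner = model.get_submodule(module_path) if module_path else model
            setattr(owner, buf_name, buf.to(device))
        return model

    # Example:
    #   bn = nn.BatchNorm1d(8)
    #   move_buffers_to_device(bn, torch.device("cpu"))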
