@@ -23,10 +23,11 @@
 import torch
 from accelerate import init_empty_weights
 from nemo_automodel import (
-    NeMoAutoModelForCausalLM,
     NeMoAutoModelForSequenceClassification,
 )
-from nemo_automodel.components._transformers.utils import sliding_window_overwrite
+from nemo_automodel.components._transformers.utils import (
+    sliding_window_overwrite,
+)
 from nemo_automodel.components.distributed.cp_utils import (
     create_context_parallel_ctx,
     get_train_context,
@@ -56,6 +57,7 @@
 from torch.distributed.tensor import DTensor, Shard
 from transformers import (
     AutoConfig,
+    AutoProcessor,
     AutoTokenizer,
 )
 from transformers.models.gemma3.modeling_gemma3 import Gemma3ForCausalLM
@@ -79,6 +81,7 @@
     get_handle_from_tensor,
     get_runtime_env_for_policy_worker,
     import_class_from_path,
+    resolve_model_class,
 )
 from nemo_rl.utils.native_checkpoint import (
     load_checkpoint,
@@ -105,12 +108,19 @@ def __init__(
         self,
         config: PolicyConfig,
         tokenizer: AutoTokenizer,
+        processor: Optional[AutoProcessor] = None,
         weights_path: Optional[str] = None,
         optimizer_path: Optional[str] = None,
         init_optimizer: bool = True,
         init_reference_model: bool = True,
         **kwargs: Any,
     ):
+        self.tokenizer = tokenizer
+        self.processor = processor
+        self.is_vlm = processor is not None
+
+        print(f"Initializing DTensorPolicyWorkerV2 with is_vlm={self.is_vlm}")
+
         self.is_generation_colocated = None
         if "generation" in config and config["generation"] is not None:
            self.is_generation_colocated = config["generation"]["colocated"]["enabled"]
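For context, a minimal sketch of how a caller might decide whether to hand the worker a processor, which is what flips `is_vlm` above. The model id and the `vision_config` heuristic are illustrative assumptions, not taken from this commit:

```python
# Illustrative only: the model id and the vision_config heuristic are assumptions.
from transformers import AutoConfig, AutoProcessor, AutoTokenizer

model_name = "Qwen/Qwen2.5-VL-3B-Instruct"  # placeholder multimodal checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Pass a processor only for multimodal checkpoints; text-only models keep processor=None.
config = AutoConfig.from_pretrained(model_name)
processor = (
    AutoProcessor.from_pretrained(model_name)
    if getattr(config, "vision_config", None) is not None
    else None
)

# worker = DTensorPolicyWorkerV2(policy_cfg, tokenizer, processor=processor)
```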
@@ -146,6 +156,9 @@ def __init__(
         print(f"[Rank {self.rank}] Loading model {model_name} on CPU...")
         self.enable_seq_packing = self.cfg["sequence_packing"]["enabled"]
         if self.enable_seq_packing:
+            assert not self.is_vlm, (
+                "Sequence packing is not supported for VLM models. Please set policy.sequence_packing.enabled = False to train VLM models."
+            )
             print(
                 f"[Rank {self.rank}] Sequence packing is enabled for model {model_name}"
             )
@@ -195,7 +208,8 @@ def __init__(
             else:
                 raise ValueError(f"Unknown reward model type: {rm_type}")
         else:
-            model_class = NeMoAutoModelForCausalLM
+            # DO NOT assume AutoModelForCausalLM, multimodal models can inherit from AutoModelForImageTextToText, AutoModelForTextToWaveform, etc.
+            model_class = resolve_model_class(model_config.model_type)
 
         full_state_dict = None
         if self.rank == 0:
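A rough sketch of what a model-type resolver along these lines could look like; the actual `resolve_model_class` in nemo_rl may be implemented differently, and the registry lookup below is an assumption based on the Hugging Face Auto-class mappings:

```python
# Hypothetical sketch; the real resolve_model_class may differ.
from transformers import AutoModelForCausalLM, AutoModelForImageTextToText
from transformers.models.auto.modeling_auto import (
    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
)


def resolve_model_class_sketch(model_type: str):
    """Return the Auto class whose registry claims this model_type."""
    if model_type in MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES:
        return AutoModelForImageTextToText  # e.g. gemma3, qwen2_5_vl, llava
    # Other modalities (e.g. text-to-waveform) could be checked the same way.
    return AutoModelForCausalLM
```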
@@ -205,6 +219,7 @@ def __init__(
                 device_map="cpu",  # load weights onto CPU initially
                 trust_remote_code=True,
                 config=model_config,
+                torch_dtype=str(model_config.torch_dtype),
             )
 
             full_state_dict = model.state_dict()
@@ -224,19 +239,12 @@ def __init__(
                 if self.enable_seq_packing
                 else None,
                 trust_remote_code=True,
+                torch_dtype=str(model_config.torch_dtype),
             )
 
         if self.model.config.pad_token_id is None:
             self.model.config.pad_token_id = tokenizer.pad_token_id
 
-        # caching since this property is not always preserved after FSDP
-        self.tokenizer = tokenizer
-
-        # ------------------------------------------------
-        # 3) Move to GPU + Composable FSDP
-        #    (Initialize device mesh, shard submodules, then shard entire model)
-        # ------------------------------------------------
-
         tp_size = self.cfg["dtensor_cfg"]["tensor_parallel_size"]
         cp_size = self.cfg["dtensor_cfg"]["context_parallel_size"]
         if cp_size > 1 and self.enable_seq_packing:
@@ -266,6 +274,10 @@ def __init__(
266274 "See https://github.com/NVIDIA-NeMo/RL/issues/659 for more details."
267275 )
268276
277+ assert not self .is_vlm , (
278+ "Context parallel is yet not supported for VLM models. Please set cp_size = 1 to train VLM models."
279+ )
280+
269281 # For FSDP2 compatibility, we need to support HSDP structure
270282 # For now, we use dp_replicate_size = 1 (no hybrid sharding)
271283 dp_replicate_size = 1
@@ -299,6 +311,10 @@ def __init__(
         self.cp_size = cp_size
         self.device_mesh = device_mesh
 
+        # ------------------------------------------------
+        # 3) Move to GPU + Composable FSDP
+        #    (Initialize device mesh, shard submodules, then shard entire model)
+        # ------------------------------------------------
         self.model = fsdp2_strategy_parallelize(
             self.model,
             device_mesh=self.device_mesh,
@@ -597,8 +613,18 @@ def train(
                     ).repeat(batch_size, 1)
                     flash_attn_kwargs = {}
 
+                # add vlm kwargs to model call
+                vlm_kwargs = mb.get_multimodal_dict(
+                    as_tensors=True, device=input_ids.device
+                )
+                if len(vlm_kwargs) > 0:
+                    position_ids = None
+
                 context_parallel_ctx = None
                 if self.cp_size > 1:
+                    assert len(vlm_kwargs) == 0, (
+                        f"multimodal kwargs={vlm_kwargs} are not supported for context parallel"
+                    )
                     seq_index = torch.arange(
                         seq_len, device=input_ids.device
                     ).repeat(1, 1)
@@ -624,6 +650,7 @@ def train(
                             position_ids=position_ids,
                             use_cache=False,
                             flash_attn_kwargs=flash_attn_kwargs,
+                            **vlm_kwargs,
                         )
 
                         if self._is_reward_model:
@@ -632,6 +659,9 @@ def train(
                             # is not supported for reward models.
                             assert not flash_attn_kwargs
                             del model_args["flash_attn_kwargs"]
+                        # remove flash_attn_kwargs if there are multimodal kwargs
+                        if len(vlm_kwargs) > 0:
+                            del model_args["flash_attn_kwargs"]
 
                         outputs = self.model(**model_args)
 
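The multimodal handling above follows one convention in both the train and logprob paths: when multimodal tensors are present, the explicit position_ids are dropped (the model derives them internally) and the text-only flash-attention kwargs are removed before the forward call. A small hypothetical helper, not part of this commit, that mirrors that convention:

```python
# Hypothetical helper mirroring the multimodal convention in train()/get_logprobs();
# not part of this commit.
from typing import Any, Dict, Optional

import torch


def build_forward_kwargs(
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor,
    position_ids: Optional[torch.Tensor],
    flash_attn_kwargs: Dict[str, Any],
    vlm_kwargs: Dict[str, Any],
) -> Dict[str, Any]:
    args: Dict[str, Any] = dict(
        input_ids=input_ids,
        attention_mask=attention_mask,
        # Multimodal models derive position ids from the image/text layout,
        # so explicit ones are only passed for text-only batches.
        position_ids=None if vlm_kwargs else position_ids,
        use_cache=False,
        **vlm_kwargs,  # e.g. pixel_values plus model-specific layout tensors
    )
    if not vlm_kwargs:
        # The custom flash-attention path assumes text-only inputs.
        args["flash_attn_kwargs"] = flash_attn_kwargs
    return args
```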
@@ -859,9 +889,15 @@ def get_logprobs(
                 step += 1
                 input_ids = lp_batch.get("input_ids").cuda()
                 input_lengths = lp_batch.get("input_lengths")
+                vlm_kwargs = lp_batch.get_multimodal_dict(
+                    as_tensors=True, device=input_ids.device
+                )
 
                 batch_size, seq_len = input_ids.shape
                 if self.enable_seq_packing:
+                    assert len(vlm_kwargs) == 0, (
+                        "multimodal kwargs are not supported for sequence packing"
+                    )
                     input_ids, position_ids, _ = pack_sequences(
                         input_ids=input_ids,
                         input_lengths=input_lengths,
@@ -901,8 +937,15 @@ def get_logprobs(
                         (batch_size, seq_len), dtype=torch.long, device=input_ids.device
                     )
 
+                # if there are multimodal kwargs, we don't need to add position_ids (computed internally)
+                if len(vlm_kwargs) > 0:
+                    position_ids = None
+
                 context_parallel_ctx = None
                 if self.cp_size > 1:
+                    assert len(vlm_kwargs) == 0, (
+                        "multimodal kwargs are not supported for context parallel"
+                    )
                     seq_index = torch.arange(seq_len, device=input_ids.device).repeat(
                         1, 1
                     )
@@ -918,13 +961,18 @@ def get_logprobs(
 
                 with get_train_context(False, False, context_parallel_ctx)():
                     with torch.autocast(device_type="cuda", dtype=self.dtype):
-                        outputs = self.model(
+                        model_args = dict(
                             input_ids=input_ids,
                             attention_mask=attention_mask_input_all_ones,
                             position_ids=position_ids,
                             use_cache=False,
                             flash_attn_kwargs=flash_attn_kwargs,
+                            **vlm_kwargs,
                         )
+                        if len(vlm_kwargs) > 0:
+                            del model_args["flash_attn_kwargs"]
+
+                        outputs = self.model(**model_args)
 
                         logits = outputs.logits
 