fix: policy.generation.colocated.resources.gpus_per_node is now required for non-colocated setups (NVIDIA-NeMo#1273)

terrykong · web-flow · commit dccdf79d2775 · 2025-10-05T07:26:42.000Z
Signed-off-by: Terry Kong <terryk@nvidia.com>
diff --git a/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.yaml b/examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.yaml
@@ -42,6 +42,7 @@ policy:
     colocated:
       enabled: false
       resources:
+        gpus_per_node: 8
         num_nodes: 1
 data:
   max_input_seq_length: 4096
diff --git a/nemo_rl/algorithms/distillation.py b/nemo_rl/algorithms/distillation.py
@@ -304,8 +304,10 @@ def setup(
 
         # validate and configure resources
         if cluster_config["num_nodes"] == 1:
-            assert inference_gpus_per_node > 0, (
-                "policy.generation.colocated.resources.gpus_per_node must be > 0 "
+            assert (
+                inference_gpus_per_node is not None and inference_gpus_per_node > 0
+            ), (
+                "policy.generation.colocated.resources.gpus_per_node must be explicitly set to a value > 0 "
                 "when cluster.num_nodes = 1 and inference is non-colocated, "
                 f"but got {inference_gpus_per_node}."
             )
@@ -323,14 +325,13 @@ def setup(
                 f"but got {inference_nodes}."
             )
             assert (
-                inference_gpus_per_node is None
-                or inference_gpus_per_node == cluster_config["gpus_per_node"]
+                inference_gpus_per_node is not None
+                and inference_gpus_per_node == cluster_config["gpus_per_node"]
             ), (
-                "policy.generation.colocated.resources.gpus_per_node must be equal to cluster.gpus_per_node or set to null "
+                "policy.generation.colocated.resources.gpus_per_node must be explicitly set and equal to cluster.gpus_per_node "
                 "when cluster.num_nodes > 1 and inference is non-colocated, "
-                f"but got {inference_gpus_per_node}."
+                f"but got inference_gpus_per_node={inference_gpus_per_node}, cluster.gpus_per_node={cluster_config['gpus_per_node']}."
             )
-            inference_gpus_per_node = cluster_config["gpus_per_node"]
             train_nodes -= inference_nodes
 
         # create clusters
diff --git a/nemo_rl/algorithms/grpo.py b/nemo_rl/algorithms/grpo.py
@@ -305,8 +305,10 @@ def setup(
         # validate and configure resources
         if policy_nodes == 1:
             # When policy_nodes == 1, train and inference are on the same node
-            assert inference_gpus_per_node > 0, (
-                "policy.generation.colocated.resources.gpus_per_node must be > 0 "
+            assert (
+                inference_gpus_per_node is not None and inference_gpus_per_node > 0
+            ), (
+                "policy.generation.colocated.resources.gpus_per_node must be explicitly set to a value > 0 "
                 "when policy_nodes = 1 and inference is non-colocated, "
                 f"but got {inference_gpus_per_node}."
             )
@@ -339,14 +341,13 @@ def setup(
                 f"but got {inference_nodes}."
             )
             assert (
-                inference_gpus_per_node is None
-                or inference_gpus_per_node == cluster_config["gpus_per_node"]
+                inference_gpus_per_node is not None
+                and inference_gpus_per_node == cluster_config["gpus_per_node"]
             ), (
-                "policy.generation.colocated.resources.gpus_per_node must be equal to cluster.gpus_per_node or set to null "
+                "policy.generation.colocated.resources.gpus_per_node must be explicitly set and equal to cluster.gpus_per_node "
                 "when cluster.num_nodes > 1 and inference is non-colocated, "
-                f"but got {inference_gpus_per_node}."
+                f"but got inference_gpus_per_node={inference_gpus_per_node}, cluster.gpus_per_node={cluster_config['gpus_per_node']}."
             )
-            inference_gpus_per_node = cluster_config["gpus_per_node"]
             train_nodes -= inference_nodes
 
         # initialize train cluster
diff --git a/tests/unit/algorithms/test_distillation.py b/tests/unit/algorithms/test_distillation.py
@@ -349,3 +349,131 @@ def test_check_vocab_equality_config_vocab_size_mismatch_raises(monkeypatch):
 
     with pytest.raises(AssertionError):
         check_vocab_equality(student_tokenizer, "student-model", "teacher-model")
+
+
+def test_noncolocated_inference_requires_explicit_gpus_per_node_single_node():
+    """Test that non-colocated inference requires explicit gpus_per_node when cluster.num_nodes=1."""
+    from unittest.mock import MagicMock, patch
+
+    from nemo_rl.algorithms.distillation import setup
+
+    # Create minimal config with non-colocated inference but gpus_per_node=None
+    master_config = {
+        "policy": {
+            "generation": {
+                "backend": "vllm",
+                "colocated": {
+                    "enabled": False,  # Non-colocated
+                    "resources": {
+                        "gpus_per_node": None,  # This should trigger error
+                        "num_nodes": None,
+                    },
+                },
+            },
+            "dtensor_cfg": {
+                "enabled": False,
+            },
+        },
+        "teacher": {
+            "dtensor_cfg": {
+                "enabled": False,
+            },
+        },
+        "loss_fn": {},
+        "distillation": {
+            "seed": 42,
+            "topk_logits_k": 64,
+            "num_prompts_per_step": 1,  # Config extraction requires this key
+            "val_period": 0,  # Config extraction requires this key
+            "val_at_start": False,  # Config extraction requires this key
+        },
+        "data": {"shuffle": False},
+        "logger": {},  # Config extraction requires this key
+        "checkpointing": {},  # Config extraction requires this key
+        "cluster": {
+            "num_nodes": 1,  # Single node
+            "gpus_per_node": 8,
+        },
+    }
+
+    tokenizer = MagicMock()
+    dataset = MagicMock()
+    dataset.__len__ = MagicMock(return_value=10)
+
+    # Mock everything we don't need to test
+    with (
+        patch("nemo_rl.algorithms.distillation.Logger") as mock_logger,
+        patch("nemo_rl.algorithms.distillation.CheckpointManager") as mock_checkpointer,
+        patch("nemo_rl.algorithms.distillation.StatefulDataLoader"),
+        pytest.raises(
+            AssertionError,
+            match="policy.generation.colocated.resources.gpus_per_node must be explicitly set",
+        ),
+    ):
+        # Configure mocks to skip checkpoint loading
+        mock_checkpointer.return_value.get_latest_checkpoint_path.return_value = None
+        setup(master_config, tokenizer, dataset, None)
+
+
+def test_noncolocated_inference_requires_explicit_gpus_per_node_multi_node():
+    """Test that non-colocated inference requires explicit gpus_per_node when cluster.num_nodes>1."""
+    from unittest.mock import MagicMock, patch
+
+    from nemo_rl.algorithms.distillation import setup
+
+    # Create minimal config with non-colocated inference but gpus_per_node=None
+    master_config = {
+        "policy": {
+            "generation": {
+                "backend": "vllm",
+                "colocated": {
+                    "enabled": False,  # Non-colocated
+                    "resources": {
+                        "gpus_per_node": None,  # This should trigger error
+                        "num_nodes": 1,  # Use 1 node for inference
+                    },
+                },
+            },
+            "dtensor_cfg": {
+                "enabled": False,
+            },
+        },
+        "teacher": {
+            "dtensor_cfg": {
+                "enabled": False,
+            },
+        },
+        "loss_fn": {},
+        "distillation": {
+            "seed": 42,
+            "topk_logits_k": 64,
+            "num_prompts_per_step": 1,  # Config extraction requires this key
+            "val_period": 0,  # Config extraction requires this key
+            "val_at_start": False,  # Config extraction requires this key
+        },
+        "data": {"shuffle": False},
+        "logger": {},  # Config extraction requires this key
+        "checkpointing": {},  # Config extraction requires this key
+        "cluster": {
+            "num_nodes": 2,  # Multi-node
+            "gpus_per_node": 8,
+        },
+    }
+
+    tokenizer = MagicMock()
+    dataset = MagicMock()
+    dataset.__len__ = MagicMock(return_value=10)
+
+    # Mock everything we don't need to test
+    with (
+        patch("nemo_rl.algorithms.distillation.Logger") as mock_logger,
+        patch("nemo_rl.algorithms.distillation.CheckpointManager") as mock_checkpointer,
+        patch("nemo_rl.algorithms.distillation.StatefulDataLoader"),
+        pytest.raises(
+            AssertionError,
+            match="policy.generation.colocated.resources.gpus_per_node must be explicitly set",
+        ),
+    ):
+        # Configure mocks to skip checkpoint loading
+        mock_checkpointer.return_value.get_latest_checkpoint_path.return_value = None
+        setup(master_config, tokenizer, dataset, None)
diff --git a/tests/unit/algorithms/test_grpo.py b/tests/unit/algorithms/test_grpo.py
@@ -210,3 +210,115 @@ def test_calculate_rewards_missing_environment():
         ValueError, match="No environment found for task type: unknown_task"
     ):
         calculate_rewards(batch, task_to_env)
+
+
+def test_noncolocated_inference_requires_explicit_gpus_per_node_single_node():
+    """Test that non-colocated inference requires explicit gpus_per_node when policy_nodes=1."""
+    from unittest.mock import MagicMock, patch
+
+    from nemo_rl.algorithms.grpo import setup
+
+    # Create minimal config - only what's needed before the validation we're testing
+    master_config = {
+        "policy": {
+            "generation": {
+                "backend": "vllm",
+                "colocated": {
+                    "enabled": False,  # Non-colocated
+                    "resources": {
+                        "gpus_per_node": None,  # This should trigger error
+                        "num_nodes": None,
+                    },
+                },
+            },
+        },
+        "loss_fn": {},  # Config extraction requires this key
+        "env": {},  # Config extraction requires this key
+        "grpo": {
+            "seed": 42,
+            "num_prompts_per_step": 1,
+            "val_period": 0,
+            "val_at_start": False,
+        },
+        "data": {"shuffle": False},
+        "logger": {},  # Config extraction requires this key
+        "checkpointing": {},  # Config extraction requires this key
+        "cluster": {
+            "num_nodes": 1,  # Single node, so policy_nodes=1
+            "gpus_per_node": 8,
+        },
+    }
+
+    tokenizer = MagicMock()
+    dataset = MagicMock()
+    dataset.__len__ = MagicMock(return_value=10)
+
+    # Mock everything we don't need to test
+    with (
+        patch("nemo_rl.algorithms.grpo.Logger") as mock_logger,
+        patch("nemo_rl.algorithms.grpo.CheckpointManager") as mock_checkpointer,
+        patch("nemo_rl.algorithms.grpo.StatefulDataLoader"),
+        pytest.raises(
+            AssertionError,
+            match="policy.generation.colocated.resources.gpus_per_node must be explicitly set",
+        ),
+    ):
+        # Configure mocks to skip checkpoint loading
+        mock_checkpointer.return_value.get_latest_checkpoint_path.return_value = None
+        setup(master_config, tokenizer, dataset, None)
+
+
+def test_noncolocated_inference_requires_explicit_gpus_per_node_multi_node():
+    """Test that non-colocated inference requires explicit gpus_per_node when policy_nodes>1."""
+    from unittest.mock import MagicMock, patch
+
+    from nemo_rl.algorithms.grpo import setup
+
+    # Create minimal config - only what's needed before the validation we're testing
+    master_config = {
+        "policy": {
+            "generation": {
+                "backend": "vllm",
+                "colocated": {
+                    "enabled": False,  # Non-colocated
+                    "resources": {
+                        "gpus_per_node": None,  # This should trigger error
+                        "num_nodes": 1,  # Use 1 node for inference
+                    },
+                },
+            },
+        },
+        "loss_fn": {},  # Config extraction requires this key
+        "env": {},  # Config extraction requires this key
+        "grpo": {
+            "seed": 42,
+            "num_prompts_per_step": 1,
+            "val_period": 0,
+            "val_at_start": False,
+        },
+        "data": {"shuffle": False},
+        "logger": {},  # Config extraction requires this key
+        "checkpointing": {},  # Config extraction requires this key
+        "cluster": {
+            "num_nodes": 2,  # Multi-node, so the policy_nodes > 1 validation branch is exercised
+            "gpus_per_node": 8,
+        },
+    }
+
+    tokenizer = MagicMock()
+    dataset = MagicMock()
+    dataset.__len__ = MagicMock(return_value=10)
+
+    # Mock everything we don't need to test
+    with (
+        patch("nemo_rl.algorithms.grpo.Logger") as mock_logger,
+        patch("nemo_rl.algorithms.grpo.CheckpointManager") as mock_checkpointer,
+        patch("nemo_rl.algorithms.grpo.StatefulDataLoader"),
+        pytest.raises(
+            AssertionError,
+            match="policy.generation.colocated.resources.gpus_per_node must be explicitly set",
+        ),
+    ):
+        # Configure mocks to skip checkpoint loading
+        mock_checkpointer.return_value.get_latest_checkpoint_path.return_value = None
+        setup(master_config, tokenizer, dataset, None)