Skip to content

Commit dccdf79

Browse files
authored
fix: colocated.resources.gpus_per_node is now required for colocated setups (NVIDIA-NeMo#1273)
Signed-off-by: Terry Kong <terryk@nvidia.com>
1 parent 9198918 commit dccdf79

File tree

5 files changed

+257
-14
lines changed

5 files changed

+257
-14
lines changed

examples/configs/recipes/llm/grpo-llama3.1-8b-instruct-2n8g-fsdp2tp1-noncolocated.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ policy:
4242
colocated:
4343
enabled: false
4444
resources:
45+
gpus_per_node: 8
4546
num_nodes: 1
4647
data:
4748
max_input_seq_length: 4096

nemo_rl/algorithms/distillation.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -304,8 +304,10 @@ def setup(
304304

305305
# validate and configure resources
306306
if cluster_config["num_nodes"] == 1:
307-
assert inference_gpus_per_node > 0, (
308-
"policy.generation.colocated.resources.gpus_per_node must be > 0 "
307+
assert (
308+
inference_gpus_per_node is not None and inference_gpus_per_node > 0
309+
), (
310+
"policy.generation.colocated.resources.gpus_per_node must be explicitly set to a value > 0 "
309311
"when cluster.num_nodes = 1 and inference is non-colocated, "
310312
f"but got {inference_gpus_per_node}."
311313
)
@@ -323,14 +325,13 @@ def setup(
323325
f"but got {inference_nodes}."
324326
)
325327
assert (
326-
inference_gpus_per_node is None
327-
or inference_gpus_per_node == cluster_config["gpus_per_node"]
328+
inference_gpus_per_node is not None
329+
and inference_gpus_per_node == cluster_config["gpus_per_node"]
328330
), (
329-
"policy.generation.colocated.resources.gpus_per_node must be equal to cluster.gpus_per_node or set to null "
331+
"policy.generation.colocated.resources.gpus_per_node must be explicitly set and equal to cluster.gpus_per_node "
330332
"when cluster.num_nodes > 1 and inference is non-colocated, "
331-
f"but got {inference_gpus_per_node}."
333+
f"but got inference_gpus_per_node={inference_gpus_per_node}, cluster.gpus_per_node={cluster_config['gpus_per_node']}."
332334
)
333-
inference_gpus_per_node = cluster_config["gpus_per_node"]
334335
train_nodes -= inference_nodes
335336

336337
# create clusters

nemo_rl/algorithms/grpo.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -305,8 +305,10 @@ def setup(
305305
# validate and configure resources
306306
if policy_nodes == 1:
307307
# When policy_nodes == 1, train and inference are on the same node
308-
assert inference_gpus_per_node > 0, (
309-
"policy.generation.colocated.resources.gpus_per_node must be > 0 "
308+
assert (
309+
inference_gpus_per_node is not None and inference_gpus_per_node > 0
310+
), (
311+
"policy.generation.colocated.resources.gpus_per_node must be explicitly set to a value > 0 "
310312
"when policy_nodes = 1 and inference is non-colocated, "
311313
f"but got {inference_gpus_per_node}."
312314
)
@@ -339,14 +341,13 @@ def setup(
339341
f"but got {inference_nodes}."
340342
)
341343
assert (
342-
inference_gpus_per_node is None
343-
or inference_gpus_per_node == cluster_config["gpus_per_node"]
344+
inference_gpus_per_node is not None
345+
and inference_gpus_per_node == cluster_config["gpus_per_node"]
344346
), (
345-
"policy.generation.colocated.resources.gpus_per_node must be equal to cluster.gpus_per_node or set to null "
347+
"policy.generation.colocated.resources.gpus_per_node must be explicitly set and equal to cluster.gpus_per_node "
346348
"when cluster.num_nodes > 1 and inference is non-colocated, "
347-
f"but got {inference_gpus_per_node}."
349+
f"but got inference_gpus_per_node={inference_gpus_per_node}, cluster.gpus_per_node={cluster_config['gpus_per_node']}."
348350
)
349-
inference_gpus_per_node = cluster_config["gpus_per_node"]
350351
train_nodes -= inference_nodes
351352

352353
# initialize train cluster

tests/unit/algorithms/test_distillation.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -349,3 +349,131 @@ def test_check_vocab_equality_config_vocab_size_mismatch_raises(monkeypatch):
349349

350350
with pytest.raises(AssertionError):
351351
check_vocab_equality(student_tokenizer, "student-model", "teacher-model")
352+
353+
354+
def test_noncolocated_inference_requires_explicit_gpus_per_node_single_node():
355+
"""Test that non-colocated inference requires explicit gpus_per_node when cluster.num_nodes=1."""
356+
from unittest.mock import MagicMock, patch
357+
358+
from nemo_rl.algorithms.distillation import setup
359+
360+
# Create minimal config with non-colocated inference but gpus_per_node=None
361+
master_config = {
362+
"policy": {
363+
"generation": {
364+
"backend": "vllm",
365+
"colocated": {
366+
"enabled": False, # Non-colocated
367+
"resources": {
368+
"gpus_per_node": None, # This should trigger error
369+
"num_nodes": None,
370+
},
371+
},
372+
},
373+
"dtensor_cfg": {
374+
"enabled": False,
375+
},
376+
},
377+
"teacher": {
378+
"dtensor_cfg": {
379+
"enabled": False,
380+
},
381+
},
382+
"loss_fn": {},
383+
"distillation": {
384+
"seed": 42,
385+
"topk_logits_k": 64,
386+
"num_prompts_per_step": 1, # Config extraction requires this key
387+
"val_period": 0, # Config extraction requires this key
388+
"val_at_start": False, # Config extraction requires this key
389+
},
390+
"data": {"shuffle": False},
391+
"logger": {}, # Config extraction requires this key
392+
"checkpointing": {}, # Config extraction requires this key
393+
"cluster": {
394+
"num_nodes": 1, # Single node
395+
"gpus_per_node": 8,
396+
},
397+
}
398+
399+
tokenizer = MagicMock()
400+
dataset = MagicMock()
401+
dataset.__len__ = MagicMock(return_value=10)
402+
403+
# Mock everything we don't need to test
404+
with (
405+
patch("nemo_rl.algorithms.distillation.Logger") as mock_logger,
406+
patch("nemo_rl.algorithms.distillation.CheckpointManager") as mock_checkpointer,
407+
patch("nemo_rl.algorithms.distillation.StatefulDataLoader"),
408+
pytest.raises(
409+
AssertionError,
410+
match="policy.generation.colocated.resources.gpus_per_node must be explicitly set",
411+
),
412+
):
413+
# Configure mocks to skip checkpoint loading
414+
mock_checkpointer.return_value.get_latest_checkpoint_path.return_value = None
415+
setup(master_config, tokenizer, dataset, None)
416+
417+
418+
def test_noncolocated_inference_requires_explicit_gpus_per_node_multi_node():
419+
"""Test that non-colocated inference requires explicit gpus_per_node when cluster.num_nodes>1."""
420+
from unittest.mock import MagicMock, patch
421+
422+
from nemo_rl.algorithms.distillation import setup
423+
424+
# Create minimal config with non-colocated inference but gpus_per_node=None
425+
master_config = {
426+
"policy": {
427+
"generation": {
428+
"backend": "vllm",
429+
"colocated": {
430+
"enabled": False, # Non-colocated
431+
"resources": {
432+
"gpus_per_node": None, # This should trigger error
433+
"num_nodes": 1, # Use 1 node for inference
434+
},
435+
},
436+
},
437+
"dtensor_cfg": {
438+
"enabled": False,
439+
},
440+
},
441+
"teacher": {
442+
"dtensor_cfg": {
443+
"enabled": False,
444+
},
445+
},
446+
"loss_fn": {},
447+
"distillation": {
448+
"seed": 42,
449+
"topk_logits_k": 64,
450+
"num_prompts_per_step": 1, # Config extraction requires this key
451+
"val_period": 0, # Config extraction requires this key
452+
"val_at_start": False, # Config extraction requires this key
453+
},
454+
"data": {"shuffle": False},
455+
"logger": {}, # Config extraction requires this key
456+
"checkpointing": {}, # Config extraction requires this key
457+
"cluster": {
458+
"num_nodes": 2, # Multi-node
459+
"gpus_per_node": 8,
460+
},
461+
}
462+
463+
tokenizer = MagicMock()
464+
dataset = MagicMock()
465+
dataset.__len__ = MagicMock(return_value=10)
466+
467+
# Mock everything we don't need to test
468+
with (
469+
patch("nemo_rl.algorithms.distillation.Logger") as mock_logger,
470+
patch("nemo_rl.algorithms.distillation.CheckpointManager") as mock_checkpointer,
471+
patch("nemo_rl.algorithms.distillation.StatefulDataLoader"),
472+
pytest.raises(
473+
AssertionError,
474+
match="policy.generation.colocated.resources.gpus_per_node must be explicitly set",
475+
),
476+
):
477+
# Configure mocks to skip checkpoint loading
478+
mock_checkpointer.return_value.get_latest_checkpoint_path.return_value = None
479+
setup(master_config, tokenizer, dataset, None)

tests/unit/algorithms/test_grpo.py

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -210,3 +210,115 @@ def test_calculate_rewards_missing_environment():
210210
ValueError, match="No environment found for task type: unknown_task"
211211
):
212212
calculate_rewards(batch, task_to_env)
213+
214+
215+
def test_noncolocated_inference_requires_explicit_gpus_per_node_single_node():
216+
"""Test that non-colocated inference requires explicit gpus_per_node when policy_nodes=1."""
217+
from unittest.mock import MagicMock, patch
218+
219+
from nemo_rl.algorithms.grpo import setup
220+
221+
# Create minimal config - only what's needed before the validation we're testing
222+
master_config = {
223+
"policy": {
224+
"generation": {
225+
"backend": "vllm",
226+
"colocated": {
227+
"enabled": False, # Non-colocated
228+
"resources": {
229+
"gpus_per_node": None, # This should trigger error
230+
"num_nodes": None,
231+
},
232+
},
233+
},
234+
},
235+
"loss_fn": {}, # Config extraction requires this key
236+
"env": {}, # Config extraction requires this key
237+
"grpo": {
238+
"seed": 42,
239+
"num_prompts_per_step": 1,
240+
"val_period": 0,
241+
"val_at_start": False,
242+
},
243+
"data": {"shuffle": False},
244+
"logger": {}, # Config extraction requires this key
245+
"checkpointing": {}, # Config extraction requires this key
246+
"cluster": {
247+
"num_nodes": 1, # Single node, so policy_nodes=1
248+
"gpus_per_node": 8,
249+
},
250+
}
251+
252+
tokenizer = MagicMock()
253+
dataset = MagicMock()
254+
dataset.__len__ = MagicMock(return_value=10)
255+
256+
# Mock everything we don't need to test
257+
with (
258+
patch("nemo_rl.algorithms.grpo.Logger") as mock_logger,
259+
patch("nemo_rl.algorithms.grpo.CheckpointManager") as mock_checkpointer,
260+
patch("nemo_rl.algorithms.grpo.StatefulDataLoader"),
261+
pytest.raises(
262+
AssertionError,
263+
match="policy.generation.colocated.resources.gpus_per_node must be explicitly set",
264+
),
265+
):
266+
# Configure mocks to skip checkpoint loading
267+
mock_checkpointer.return_value.get_latest_checkpoint_path.return_value = None
268+
setup(master_config, tokenizer, dataset, None)
269+
270+
271+
def test_noncolocated_inference_requires_explicit_gpus_per_node_multi_node():
272+
"""Test that non-colocated inference requires explicit gpus_per_node when policy_nodes>1."""
273+
from unittest.mock import MagicMock, patch
274+
275+
from nemo_rl.algorithms.grpo import setup
276+
277+
# Create minimal config - only what's needed before the validation we're testing
278+
master_config = {
279+
"policy": {
280+
"generation": {
281+
"backend": "vllm",
282+
"colocated": {
283+
"enabled": False, # Non-colocated
284+
"resources": {
285+
"gpus_per_node": None, # This should trigger error
286+
"num_nodes": 1, # Use 1 node for inference
287+
},
288+
},
289+
},
290+
},
291+
"loss_fn": {}, # Config extraction requires this key
292+
"env": {}, # Config extraction requires this key
293+
"grpo": {
294+
"seed": 42,
295+
"num_prompts_per_step": 1,
296+
"val_period": 0,
297+
"val_at_start": False,
298+
},
299+
"data": {"shuffle": False},
300+
"logger": {}, # Config extraction requires this key
301+
"checkpointing": {}, # Config extraction requires this key
302+
"cluster": {
303+
"num_nodes": 2, # Multi-node, so policy_nodes=1 after subtracting inference
304+
"gpus_per_node": 8,
305+
},
306+
}
307+
308+
tokenizer = MagicMock()
309+
dataset = MagicMock()
310+
dataset.__len__ = MagicMock(return_value=10)
311+
312+
# Mock everything we don't need to test
313+
with (
314+
patch("nemo_rl.algorithms.grpo.Logger") as mock_logger,
315+
patch("nemo_rl.algorithms.grpo.CheckpointManager") as mock_checkpointer,
316+
patch("nemo_rl.algorithms.grpo.StatefulDataLoader"),
317+
pytest.raises(
318+
AssertionError,
319+
match="policy.generation.colocated.resources.gpus_per_node must be explicitly set",
320+
),
321+
):
322+
# Configure mocks to skip checkpoint loading
323+
mock_checkpointer.return_value.get_latest_checkpoint_path.return_value = None
324+
setup(master_config, tokenizer, dataset, None)

0 commit comments

Comments
 (0)