@@ -0,0 +1,37 @@
defaults: ../../grpo_math_1B.yaml
grpo:
  max_num_steps: 500
checkpointing:
  enabled: false
  checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-megatron
  save_period: 100
policy:
  model_name: meta-llama/Llama-3.2-1B-Instruct
  tokenizer:
    name: meta-llama/Llama-3.2-1B-Instruct
  optimizer: null
  megatron_cfg:
    enabled: true
    scheduler:
      lr_warmup_iters: 50
  dtensor_cfg:
    enabled: false
  make_sequence_length_divisible_by: 1
  generation:
    max_new_tokens: 512
    vllm_cfg:
      max_model_len: 512
    temperature: 0.8
    top_p: 0.9
    top_k: 50
data:
  max_input_seq_length: 512
logger:
  log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-megatron
  wandb_enabled: true
  tensorboard_enabled: true
  wandb:
    project: nemo-rl
    name: grpo-llama3.2-1b-instruct-1n8g-megatron
cluster:
  gpus_per_node: 8
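The `defaults: ../../grpo_math_1B.yaml` line suggests this file is a small delta applied on top of a shared base config. A minimal sketch of how such resolution can work, assuming a recursive dict merge where the override file wins (NeMo RL's actual resolution code may differ, and the toy `base`/`override` values below are illustrative, not the real base config):

```python
def deep_merge(base: dict, override: dict) -> dict:
    """Recursively overlay `override` onto `base`; scalars in `override` win,
    nested dicts are merged key by key rather than replaced wholesale."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge(merged[key], value)
        else:
            merged[key] = value
    return merged

# Toy stand-ins: a few fields of a hypothetical base config and the override.
base = {
    "grpo": {"max_num_steps": 1000, "num_prompts_per_step": 32},
    "policy": {"generation": {"temperature": 1.0}},
}
override = {
    "grpo": {"max_num_steps": 500},
    "policy": {"generation": {"temperature": 0.8}},
}

resolved = deep_merge(base, override)
# Overridden keys change; untouched base keys (num_prompts_per_step) survive.
```

The key property this sketch demonstrates is that an override file only needs to restate the fields it changes, which is why the config above is so much shorter than a full training recipe.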
@@ -0,0 +1,35 @@
defaults: ../../grpo_math_1B.yaml
grpo:
  max_num_steps: 500
checkpointing:
  enabled: false
  checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-megatron
  save_period: 100
policy:
  model_name: meta-llama/Llama-3.2-1B-Instruct
  tokenizer:
    name: meta-llama/Llama-3.2-1B-Instruct
  optimizer: null
  megatron_cfg:
    enabled: true
    scheduler:
      lr_warmup_iters: 50
  dtensor_cfg:
    enabled: false
  make_sequence_length_divisible_by: 1
  generation:
    max_new_tokens: 512
    vllm_cfg:
      max_model_len: 512
    temperature: 0.6
data:
  max_input_seq_length: 512
logger:
  log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-megatron
  wandb_enabled: true
  tensorboard_enabled: true
  wandb:
    project: nemo-rl
    name: grpo-llama3.2-1b-instruct-1n8g-megatron
cluster:
  gpus_per_node: 8
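Both files set sampling knobs under `generation` (`temperature` here; the first file also sets `top_p: 0.9` and `top_k: 50`). As a reminder of what those parameters control, here is a minimal, self-contained sketch of temperature scaling followed by top-k and top-p (nucleus) filtering, illustrative only: vLLM's real implementation operates on batched tensors, not Python lists.

```python
import math

def sample_filter(logits, temperature=0.8, top_k=50, top_p=0.9):
    """Return the renormalized distribution over tokens that survive
    temperature scaling, top-k truncation, and top-p (nucleus) truncation."""
    # Temperature scaling: lower temperature sharpens the distribution.
    scaled = [l / temperature for l in logits]
    # Numerically stable softmax.
    m = max(scaled)
    exps = [math.exp(s - m) for s in scaled]
    total = sum(exps)
    probs = [e / total for e in exps]
    # Top-k: keep only the k highest-probability tokens.
    order = sorted(range(len(probs)), key=lambda i: probs[i], reverse=True)[:top_k]
    # Top-p: within those, keep the smallest prefix whose cumulative mass >= top_p.
    kept, mass = [], 0.0
    for i in order:
        kept.append(i)
        mass += probs[i]
        if mass >= top_p:
            break
    # Renormalize over the surviving tokens.
    z = sum(probs[i] for i in kept)
    return {i: probs[i] / z for i in kept}
```

A design note: top-k bounds the candidate set by count, top-p bounds it by probability mass, and applying both (as the first config does) takes the stricter of the two cutoffs for any given step.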
213 changes: 118 additions & 95 deletions nemo_rl/algorithms/loss_functions.py

Large diffs are not rendered by default.
