Commit ac0897f

Reimplement top-p and top-k from #1578
Signed-off-by: Zhanda <zhandazhu@gmail.com>
1 parent 336803f · commit ac0897f

17 files changed: +1632 −291 lines
Lines changed: 37 additions & 0 deletions
@@ -0,0 +1,37 @@
defaults: ../../grpo_math_1B.yaml
grpo:
  max_num_steps: 500
checkpointing:
  enabled: false
  checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-megatron
  save_period: 100
policy:
  model_name: meta-llama/Llama-3.2-1B-Instruct
  tokenizer:
    name: meta-llama/Llama-3.2-1B-Instruct
  optimizer: null
  megatron_cfg:
    enabled: true
    scheduler:
      lr_warmup_iters: 50
  dtensor_cfg:
    enabled: false
  make_sequence_length_divisible_by: 1
  generation:
    max_new_tokens: 512
    vllm_cfg:
      max_model_len: 512
    temperature: 0.8
    top_p: 0.9
    top_k: 50
data:
  max_input_seq_length: 512
logger:
  log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-megatron
  wandb_enabled: true
  tensorboard_enabled: true
  wandb:
    project: nemo-rl
    name: grpo-llama3.2-1b-instruct-1n8g-megatron
cluster:
  gpus_per_node: 8
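The config above exercises both samplers this commit reimplements (`top_p: 0.9`, `top_k: 50`). As a rough sketch of what those two settings mean, not the commit's actual implementation, top-k/top-p filtering over a single logit vector can look like this (the function name and NumPy formulation are illustrative):

```python
import numpy as np

def top_k_top_p_filter(logits, top_k=50, top_p=0.9):
    """Mask logits so sampling is restricted to the top_k most likely
    tokens, then to the smallest set whose cumulative probability
    exceeds top_p (nucleus sampling). Disallowed tokens become -inf."""
    logits = np.asarray(logits, dtype=np.float64).copy()

    # Top-k: everything below the k-th largest logit is disallowed.
    if 0 < top_k < logits.size:
        kth_largest = np.sort(logits)[-top_k]
        logits[logits < kth_largest] = -np.inf

    # Top-p: sort by probability and keep the shortest prefix whose
    # cumulative mass exceeds top_p.
    probs = np.exp(logits - logits.max())
    probs /= probs.sum()
    order = np.argsort(probs)[::-1]           # most likely first
    cumulative = np.cumsum(probs[order])
    exceeded = cumulative > top_p
    # Shift right so the token that first crosses top_p survives.
    remove = np.concatenate(([False], exceeded[:-1]))
    logits[order[remove]] = -np.inf
    return logits
```

Sampling then draws from a softmax over the filtered logits, so only the surviving tokens can ever be chosen.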
Lines changed: 35 additions & 0 deletions
@@ -0,0 +1,35 @@
defaults: ../../grpo_math_1B.yaml
grpo:
  max_num_steps: 500
checkpointing:
  enabled: false
  checkpoint_dir: results/grpo-llama3.2-1b-instruct-1n8g-megatron
  save_period: 100
policy:
  model_name: meta-llama/Llama-3.2-1B-Instruct
  tokenizer:
    name: meta-llama/Llama-3.2-1B-Instruct
  optimizer: null
  megatron_cfg:
    enabled: true
    scheduler:
      lr_warmup_iters: 50
  dtensor_cfg:
    enabled: false
  make_sequence_length_divisible_by: 1
  generation:
    max_new_tokens: 512
    vllm_cfg:
      max_model_len: 512
    temperature: 0.6
data:
  max_input_seq_length: 512
logger:
  log_dir: logs/grpo-llama3.2-1b-instruct-1n8g-megatron
  wandb_enabled: true
  tensorboard_enabled: true
  wandb:
    project: nemo-rl
    name: grpo-llama3.2-1b-instruct-1n8g-megatron
cluster:
  gpus_per_node: 8

nemo_rl/algorithms/loss_functions.py

Lines changed: 118 additions & 95 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)