Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion apps/grpo/qwen3_32b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

# Global configuration
group_size: 16
local_batch_size: 32 # per-device batch size
local_batch_size: 2 # per-device batch size
max_req_tokens: 1024
max_res_tokens: 1024
model: "Qwen/Qwen3-32B"
Expand Down
1 change: 1 addition & 0 deletions apps/grpo/qwen3_8b.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ off_by_n: 1 # Off by one by default
# Observability configuration
metric_logging:
wandb:
entity: agentic-models
project: grpo-training
group: grpo_exp_${oc.env:USER}
logging_mode: global_reduce # global_reduce, per_rank_reduce, per_rank_no_reduce
Expand Down
9 changes: 7 additions & 2 deletions src/forge/controller/launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,14 +136,19 @@ async def get_allocator(self, name: str, num_hosts: int) -> tuple[Any, Any, str]
for role in appdef.roles:
# Note - this is hardcoded to SLURM
# We got this with sinfo
role.resource.memMB = 2062607
role.resource.cpu = 128
role.resource.memMB = 2047962
role.resource.cpu = 192
role.resource.gpu = 8

# Note - we cannot add in an empty workspace, so we create a fake temporary one
temp_workspace = tempfile.mkdtemp(prefix="forge_workspace_")
server_config = Config(
scheduler="slurm",
scheduler_args={
"account": "agentic-models",
"qos": "h100_lowest",
"time": "72:00:00"
},
appdef=appdef,
workspace=monarch.tools.config.workspace.Workspace(dirs=[temp_workspace]),
)
Expand Down
24 changes: 24 additions & 0 deletions submit_grpo.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/bash
# Slurm batch script: launch a GRPO training run of Qwen3-32B on one 8-GPU node.
#SBATCH --job-name=grpo-qwen3-32b
#SBATCH --qos=h200_agentic-models_high
#SBATCH --account=agentic-models
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --gpus-per-node=8
#SBATCH --cpus-per-task=128
#SBATCH --mem=500G
#SBATCH --time=72:00:00

echo "Starting GRPO training job"

# Initialize conda in this non-interactive batch shell so `conda activate` works.
eval "$(conda shell.bash hook)"

conda activate forge

# Disable torch.compile for this run.
export TORCH_COMPILE_DISABLE=1
# Clear Slurm's inherited per-task memory limits — presumably so nested
# launches from inside the job are not constrained by them; TODO confirm.
unset SLURM_MEM_PER_CPU SLURM_MEM_PER_GPU SLURM_MEM_PER_NODE
export TORCHSTORE_RDMA_ENABLED=0

# Abort if the checkout is missing (SC2164): without the guard, a failed cd
# would let the training command run from the wrong working directory.
cd /storage/home/daniellepintz/torchforge || exit 1

python -m apps.grpo.main --config apps/grpo/qwen3_32b.yaml
Loading