Enable offline logging to wandb for MAST jobs (meta-pytorch#593)

daniellepintz · web-flow · commit a8415125c519 · 2025-11-19T21:49:02.000+01:00
diff --git a/.meta/mast/README.md b/.meta/mast/README.md
@@ -119,3 +119,11 @@ This ensures that when MAST runs with `HF_HUB_OFFLINE=1`, the transformers libra
 Both cache and model files are stored under:
 - **Cache**: `/mnt/wsfuse/teamforge/hf` (set via `HF_HOME`)
 - **Model weights**: `/mnt/wsfuse/teamforge/hf/<model_name>`
+
+### Wandb Logs
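+MAST jobs log to wandb in offline mode (`mode: offline` in the MAST configs), so run data is written to disk under `/mnt/wsfuse/teamforge/wandb` instead of being uploaded. The most recent run is available at `/mnt/wsfuse/teamforge/wandb/latest-run`.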
+
+To upload the latest offline run to the `grpo-training` wandb project, run the following from a devserver with internet access:
+```bash
+wandb sync -p grpo-training /mnt/wsfuse/teamforge/wandb/latest-run
+```
diff --git a/.meta/mast/qwen3_14b_mast.yaml b/.meta/mast/qwen3_14b_mast.yaml
@@ -17,8 +17,8 @@ rollout_threads: ${services.policy.num_replicas}   # Recommended to set equal to
 # Observability configuration
 metric_logging:
   wandb:
-    project: "grpo-training"
-    group: "grpo_exp_${oc.env:USER}"
+    mode: offline
+    dir: /mnt/wsfuse/teamforge/
     logging_mode: global_reduce
   console:
     logging_mode: global_reduce
diff --git a/.meta/mast/qwen3_1_7b_mast.yaml b/.meta/mast/qwen3_1_7b_mast.yaml
@@ -17,8 +17,8 @@ rollout_threads: ${services.policy.num_replicas}   # Recommended to set equal to
 # Observability configuration
 metric_logging:
   wandb:
-    project: "grpo-training"
-    group: "grpo_exp_${oc.env:USER}"
+    mode: offline
+    dir: /mnt/wsfuse/teamforge/
     logging_mode: global_reduce
   console:
     logging_mode: global_reduce
diff --git a/.meta/mast/qwen3_32b_mast.yaml b/.meta/mast/qwen3_32b_mast.yaml
@@ -17,8 +17,8 @@ rollout_threads: ${services.policy.num_replicas}   # Recommended to set equal to
 # Observability configuration
 metric_logging:
   wandb:
-    project: "grpo-training"
-    group: "grpo_exp_${oc.env:USER}"
+    mode: offline
+    dir: /mnt/wsfuse/teamforge/
     logging_mode: global_reduce
   console:
     logging_mode: global_reduce
diff --git a/.meta/mast/qwen3_4b_mast.yaml b/.meta/mast/qwen3_4b_mast.yaml
@@ -17,8 +17,8 @@ rollout_threads: ${services.policy.num_replicas}   # Recommended to set equal to
 # Observability configuration
 metric_logging:
   wandb:
-    project: "grpo-training"
-    group: "grpo_exp_${oc.env:USER}"
+    mode: offline
+    dir: /mnt/wsfuse/teamforge/
     logging_mode: global_reduce
   console:
     logging_mode: global_reduce
diff --git a/.meta/mast/qwen3_8b_mast.yaml b/.meta/mast/qwen3_8b_mast.yaml
@@ -17,8 +17,8 @@ rollout_threads: ${services.policy.num_replicas}   # Recommended to set equal to
 # Observability configuration
 metric_logging:
   wandb:
-    project: "grpo-training"
-    group: "grpo_exp_${oc.env:USER}"
+    mode: offline
+    dir: /mnt/wsfuse/teamforge/
     logging_mode: global_reduce
   console:
     logging_mode: global_reduce
diff --git a/src/forge/controller/launcher.py b/src/forge/controller/launcher.py
@@ -293,7 +293,6 @@ def build_appdef(self) -> specs.AppDef:
                 "TORCHDYNAMO_VERBOSE": "1",
                 "VLLM_TORCH_COMPILE_LEVEL": "0",
                 "VLLM_USE_TRITON_FLASH_ATTN": "0",
-                "WANDB_MODE": "offline",
                 "HF_HUB_OFFLINE": "1",
                 "MONARCH_HOST_MESH_V1_REMOVE_ME_BEFORE_RELEASE": "1",
                 "TORCHSTORE_RDMA_ENABLED": "1",