Revert "Enable offline logging to wandb for MAST jobs (meta-pytorch#593)" (meta-pytorch#597)

daniellepintz · JenniferWang · web-flow · commit 373882180374 · 2025-11-19T22:53:50.000+01:00
Co-authored-by: Jiyue Wang <JenniferWang@users.noreply.github.com>
diff --git a/.meta/mast/README.md b/.meta/mast/README.md
@@ -119,11 +119,3 @@ This ensures that when MAST runs with `HF_HUB_OFFLINE=1`, the transformers libra
 Both cache and model files are stored under:
 - **Cache**: `/mnt/wsfuse/teamforge/hf` (set via `HF_HOME`)
 - **Model weights**: `/mnt/wsfuse/teamforge/hf/<model_name>`
-
-### Wandb Logs
-Wandb logs will be stored under `/mnt/wsfuse/teamforge/wandb`. The latest run will be stored under `/mnt/wsfuse/teamforge/wandb/latest-run`.
-
-To sync to wandb from a devserver with internet access, run:
-```bash
-wandb sync -p grpo-training /mnt/wsfuse/teamforge/wandb/latest-run
-```
diff --git a/.meta/mast/qwen3_14b_mast.yaml b/.meta/mast/qwen3_14b_mast.yaml
@@ -17,8 +17,8 @@ rollout_threads: ${services.policy.num_replicas}   # Recommended to set equal to
 # Observability configuration
 metric_logging:
   wandb:
-    mode: offline
-    dir: /mnt/wsfuse/teamforge/
+    project: "grpo-training"
+    group: "grpo_exp_${oc.env:USER}"
     logging_mode: global_reduce
   console:
     logging_mode: global_reduce
diff --git a/.meta/mast/qwen3_1_7b_mast.yaml b/.meta/mast/qwen3_1_7b_mast.yaml
@@ -17,8 +17,8 @@ rollout_threads: ${services.policy.num_replicas}   # Recommended to set equal to
 # Observability configuration
 metric_logging:
   wandb:
-    mode: offline
-    dir: /mnt/wsfuse/teamforge/
+    project: "grpo-training"
+    group: "grpo_exp_${oc.env:USER}"
     logging_mode: global_reduce
   console:
     logging_mode: global_reduce
diff --git a/.meta/mast/qwen3_32b_mast.yaml b/.meta/mast/qwen3_32b_mast.yaml
@@ -17,8 +17,8 @@ rollout_threads: ${services.policy.num_replicas}   # Recommended to set equal to
 # Observability configuration
 metric_logging:
   wandb:
-    mode: offline
-    dir: /mnt/wsfuse/teamforge/
+    project: "grpo-training"
+    group: "grpo_exp_${oc.env:USER}"
     logging_mode: global_reduce
   console:
     logging_mode: global_reduce
diff --git a/.meta/mast/qwen3_4b_mast.yaml b/.meta/mast/qwen3_4b_mast.yaml
@@ -17,8 +17,8 @@ rollout_threads: ${services.policy.num_replicas}   # Recommended to set equal to
 # Observability configuration
 metric_logging:
   wandb:
-    mode: offline
-    dir: /mnt/wsfuse/teamforge/
+    project: "grpo-training"
+    group: "grpo_exp_${oc.env:USER}"
     logging_mode: global_reduce
   console:
     logging_mode: global_reduce
diff --git a/.meta/mast/qwen3_8b_mast.yaml b/.meta/mast/qwen3_8b_mast.yaml
@@ -17,8 +17,8 @@ rollout_threads: ${services.policy.num_replicas}   # Recommended to set equal to
 # Observability configuration
 metric_logging:
   wandb:
-    mode: offline
-    dir: /mnt/wsfuse/teamforge/
+    project: "grpo-training"
+    group: "grpo_exp_${oc.env:USER}"
     logging_mode: global_reduce
   console:
     logging_mode: global_reduce
diff --git a/src/forge/controller/launcher.py b/src/forge/controller/launcher.py
@@ -293,6 +293,7 @@ def build_appdef(self) -> specs.AppDef:
                 "TORCHDYNAMO_VERBOSE": "1",
                 "VLLM_TORCH_COMPILE_LEVEL": "0",
                 "VLLM_USE_TRITON_FLASH_ATTN": "0",
+                "WANDB_MODE": "offline",
                 "HF_HUB_OFFLINE": "1",
                 "MONARCH_HOST_MESH_V1_REMOVE_ME_BEFORE_RELEASE": "1",
                 "TORCHSTORE_RDMA_ENABLED": "1",