hydrate cache

allenwang28 · allenwang28 · commit a2b3a9ea8d0e · 2025-10-16T10:01:34.000-07:00
diff --git a/.meta/mast/README.md b/.meta/mast/README.md
@@ -44,3 +44,48 @@ The launch script will automatically:
 - Launch the MAST job with the specified config
 
 You can run it from anywhere, and it will figure out the correct paths.
+
+
+## Managing HuggingFace Models in MAST
+
+### The Problem: No Internet Access
+
+MAST compute nodes cannot access the internet, which means they cannot download models directly from HuggingFace. To work around this, we store all HuggingFace models and cache data on OilFS at `/mnt/wsfuse/teamforge/hf`, which is accessible from MAST.
+
+### Solution: Two-Step Process
+
+You need to perform both steps below to ensure models work correctly in MAST:
+
+#### 1. Download Model Weights to OilFS
+
+First, download the model weights directly to the OilFS path. This should be done from a machine with internet access (like your devserver):
+
+```bash
+# Set HF_HOME to the OilFS path
+export HF_HOME=/mnt/wsfuse/teamforge/hf
+
+# Download the model (replace with your desired model)
+huggingface-cli download Qwen/Qwen3-8B --local-dir /mnt/wsfuse/teamforge/hf_artifacts/qwen3_8b
+```
+
+#### 2. Hydrate the HuggingFace Cache
+
+After downloading the weights, you need to hydrate the HuggingFace cache so that the transformers library can find the model metadata:
+
+```bash
+# Set HF_HOME to the OilFS path
+export HF_HOME=/mnt/wsfuse/teamforge/hf
+
+# Hydrate the cache for the model
+python .meta/mast/hydrate_cache.py --model-id Qwen/Qwen3-8B
+```
+
+This ensures that when MAST runs with `HF_HUB_OFFLINE=1`, the transformers library can locate all necessary files from the cache.
+
+### Directory Structure
+
+Both cache and model files are stored under:
+- **Cache**: `/mnt/wsfuse/teamforge/hf` (set via `HF_HOME`)
+- **Model weights**: `/mnt/wsfuse/teamforge/hf_artifacts/<model_name>` (e.g. `/mnt/wsfuse/teamforge/hf_artifacts/qwen3_8b`)
+
+Make sure your MAST config files point to the correct paths in `hf_artifacts`.
diff --git a/.meta/mast/hydrate_cache.py b/.meta/mast/hydrate_cache.py
@@ -0,0 +1,56 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""This is a convenience script meant for hydrating the HuggingFace cache.
+
+This is meant for downloading the model weights and tokenizer to the cache, i.e. for
+OilFS.
+
+Example:
+
+python .meta/mast/hydrate_cache.py --model-id Qwen/Qwen3-32B
+
+"""
+import argparse
+import os
+import sys
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Hydrate HuggingFace cache for a specific model"
+    )
+    parser.add_argument(
+        "--model-id",
+        type=str,
+        required=True,
+        help="HuggingFace model ID (e.g., Qwen/Qwen3-8B)",
+    )
+    args = parser.parse_args()
+
+    # Ensure HF_HOME is set
+    hf_home = os.environ.get("HF_HOME")
+    if not hf_home:
+        print(
+            "ERROR: HF_HOME environment variable must be set. "
+            "You will likely want to run export HF_HOME=/mnt/wsfuse/teamforge/hf."
+        )
+        sys.exit(1)
+
+    print(f"Using HF_HOME: {hf_home}")
+    print(f"Downloading {args.model_id}...")
+
+    # This will pull tokenizer + config + all weight shards
+    tokenizer = AutoTokenizer.from_pretrained(args.model_id, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(args.model_id, trust_remote_code=True)
+
+    print("Download complete. Cache hydrated.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/forge/controller/launcher.py b/src/forge/controller/launcher.py
@@ -296,7 +296,8 @@ def build_appdef(self) -> specs.AppDef:
                 "HF_HUB_OFFLINE": "1",
                 "MONARCH_HOST_MESH_V1_REMOVE_ME_BEFORE_RELEASE": "1",
                 "TORCHSTORE_RDMA_ENABLED": "1",
-                # "HF_HOME": "/mnt/wsfuse/teamforge/hf",
+                "HF_HOME": "/mnt/wsfuse/teamforge/hf",
+                "TRANSFORMERS_OFFLINE": "1",
             },
         }
 

Original file line number	Diff line number	Diff line change
`@@ -296,7 +296,8 @@ def build_appdef(self) -> specs.AppDef:`
`296`	`296`	`"HF_HUB_OFFLINE": "1",`
`297`	`297`	`"MONARCH_HOST_MESH_V1_REMOVE_ME_BEFORE_RELEASE": "1",`
`298`	`298`	`"TORCHSTORE_RDMA_ENABLED": "1",`
`299`		`- # "HF_HOME": "/mnt/wsfuse/teamforge/hf",`
	`299`	`+ "HF_HOME": "/mnt/wsfuse/teamforge/hf",`
	`300`	`+ "TRANSFORMERS_OFFLINE": "1",`
`300`	`301`	`},`
`301`	`302`	`}`
`302`	`303`