Add --max-epochs support for resuming if limit reached

jeremymanning · jeremymanning · commit 83a4a74bf8d2 · 2025-10-26T13:26:11.000-04:00
- Added --max-epochs flag to remote_train_hf.sh - Default: 50000 (current setting) - Can be increased if models hit the limit before reaching the target loss - Checkpoints are saved every epoch, so it is safe to resume. If the 50k limit is hit, just restart with --max-epochs 100000. Also: Added HuggingFace dataset links to documentation. Ref: #42, #38
diff --git a/README.md b/README.md
@@ -112,7 +112,7 @@ See the [Package API](#package-api) section for all available functions.
 
 See `models/README.md` for details. Pre-trained weights are not required for generating figures.
 
-**Author datasets on HuggingFace:** Cleaned text corpora for all 8 authors are publicly available on HuggingFace at https://huggingface.co/contextlab (browse datasets). Each corpus includes verified book titles and can be loaded with `from datasets import load_dataset`.
+**Author datasets on HuggingFace:** Cleaned text corpora for all 8 authors are publicly available. See `data/README.md` for dataset links and usage.
 
 ## Analysis Variants
 
diff --git a/remote_train_hf.sh b/remote_train_hf.sh
@@ -23,6 +23,7 @@ CLUSTER=""  # Must be specified with --cluster flag
 TRAIN_AUTHOR=""
 TRAIN_ALL=false
 TARGET_LOSS=0.1
+MAX_EPOCHS=50000
 
 # Parse arguments
 while [[ $# -gt 0 ]]; do
@@ -43,17 +44,22 @@ while [[ $# -gt 0 ]]; do
             TARGET_LOSS="$2"
             shift 2
             ;;
+        --max-epochs)
+            MAX_EPOCHS="$2"
+            shift 2
+            ;;
         -h|--help)
             echo "Usage: $0 [OPTIONS]"
             echo ""
             echo "Train HuggingFace models on remote GPU cluster"
             echo ""
             echo "Options:"
-            echo "  --cluster NAME      GPU cluster name (required)"
-            echo "  --author NAME       Train single author"
-            echo "  --all               Train all 8 authors"
-            echo "  --target-loss LOSS  Target training loss (default: 0.1)"
-            echo "  -h, --help          Show this help"
+            echo "  --cluster NAME        GPU cluster name (required)"
+            echo "  --author NAME         Train single author"
+            echo "  --all                 Train all 8 authors"
+            echo "  --target-loss LOSS    Target training loss (default: 0.1)"
+            echo "  --max-epochs N        Maximum epochs (default: 50000)"
+            echo "  -h, --help            Show this help"
             echo ""
             echo "Examples:"
             echo "  $0 --cluster mycluster --author baum"
@@ -127,7 +133,7 @@ else
     TRAIN_FLAGS="--author $TRAIN_AUTHOR"
 fi
 
-TRAIN_FLAGS="$TRAIN_FLAGS --target-loss $TARGET_LOSS"
+TRAIN_FLAGS="$TRAIN_FLAGS --target-loss $TARGET_LOSS --max-epochs $MAX_EPOCHS"
 
 echo
 print_info "Training configuration:"