Commit 40920c2

Remove 4-GPU limit and support unlimited GPUs with optional limiting
- Remove arbitrary limit of 4 GPUs - now uses ALL available GPUs by default
- Add --max-gpus/-g flag to optionally limit the number of GPUs used
- Update both run_llm_stylometry.sh and generate_figures.py with the new flag
- Pass MAX_GPUS environment variable through to main.py
- Show an appropriate message when GPU usage is limited vs. using all GPUs

Usage examples:
- ./run_llm_stylometry.sh -t                             # Use all available GPUs
- ./run_llm_stylometry.sh -t -g 2                        # Limit to 2 GPUs
- python code/generate_figures.py --train --max-gpus 4   # Limit to 4 GPUs

This allows better scalability on large GPU clusters while still letting users limit GPU usage when they need to share resources.
1 parent d8defd2 commit 40920c2

3 files changed: 32 additions & 5 deletions

code/generate_figures.py

Lines changed: 13 additions & 2 deletions
@@ -21,7 +21,7 @@
 from llm_stylometry.cli_utils import safe_print, format_header, is_windows
 
 
-def train_models():
+def train_models(max_gpus=None):
     """Train all models from scratch."""
     safe_print("\n" + "=" * 60)
     safe_print("Training Models from Scratch")
@@ -78,6 +78,10 @@ def train_models():
     env['NO_MULTIPROCESSING'] = '1'
     # Set PyTorch memory management for better GPU memory usage
     env['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
+    # Pass through max GPUs limit if specified
+    if max_gpus:
+        env['MAX_GPUS'] = str(max_gpus)
+        safe_print(f"Limiting to {max_gpus} GPU(s)")
     # Run without capturing output so we can see progress
     result = subprocess.run([sys.executable, 'code/main.py'], env=env, check=False)
     if result.returncode != 0:
@@ -194,6 +198,13 @@ def main():
         help='List available figures'
     )
 
+    parser.add_argument(
+        '--max-gpus', '-g',
+        type=int,
+        help='Maximum number of GPUs to use for training (default: all available)',
+        default=None
+    )
+
     args = parser.parse_args()
 
     if args.list:
@@ -211,7 +222,7 @@ def main():
 
     # Train models if requested
     if args.train:
-        if not train_models():
+        if not train_models(max_gpus=args.max_gpus):
             return 1
         # Update data path to use newly generated results
         args.data = 'data/model_results.pkl'
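For context, the hunk above only shows env=env being handed to subprocess.run; a minimal, self-contained sketch of that pass-through pattern is below. The hard-coded max_gpus = 2 is hypothetical; in generate_figures.py the value comes from the --max-gpus argument.

import os
import subprocess
import sys

# Hypothetical stand-in for the value parsed from --max-gpus.
max_gpus = 2

# Copy the parent environment and add MAX_GPUS so the child process can see it.
env = os.environ.copy()
if max_gpus:
    env['MAX_GPUS'] = str(max_gpus)

# Launch main.py with the augmented environment; check=False mirrors the diff,
# where the return code is inspected instead of raising on failure.
result = subprocess.run([sys.executable, 'code/main.py'], env=env, check=False)
print(f"main.py exited with {result.returncode}")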

code/main.py

Lines changed: 7 additions & 2 deletions
@@ -300,8 +300,13 @@ def run_experiment(exp: Experiment, device_queue, device_type="cuda"):
 
     # Use already detected device configuration
     if device_type == "cuda":
-        gpu_count = min(device_count, 4)
-        print(f"Using {gpu_count} GPUs out of {device_count} available")
+        # Check for MAX_GPUS environment variable to optionally limit GPU usage
+        max_gpus = int(os.environ.get('MAX_GPUS', '0')) or device_count
+        gpu_count = min(device_count, max_gpus)
+        if gpu_count < device_count:
+            print(f"Using {gpu_count} GPUs (limited by MAX_GPUS) out of {device_count} available")
+        else:
+            print(f"Using all {gpu_count} available GPUs")
     elif device_type == "mps":
         gpu_count = 1
         print("Using Apple Metal Performance Shaders (MPS)")

run_llm_stylometry.sh

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ OPTIONS:
3535
-h, --help Show this help message
3636
-f, --figure FIGURE Generate specific figure (1a, 1b, 2a, 2b, 3, 4, 5)
3737
-t, --train Train models from scratch before generating figures
38+
-g, --max-gpus NUM Maximum number of GPUs to use for training (default: all)
3839
-d, --data PATH Path to model_results.pkl (default: data/model_results.pkl)
3940
-o, --output DIR Output directory for figures (default: paper/figs/source)
4041
-l, --list List available figures
@@ -48,7 +49,8 @@ EXAMPLES:
4849
$0 # Setup environment and generate all figures
4950
$0 -f 1a # Generate only Figure 1A
5051
$0 -f 4 # Generate only Figure 4 (MDS plot)
51-
$0 -t # Train models from scratch, then generate figures
52+
$0 -t # Train models from scratch using all GPUs
53+
$0 -t -g 2 # Train models using only 2 GPUs
5254
$0 -l # List available figures
5355
$0 --setup-only # Only setup the environment
5456
$0 --clean # Remove environment and reinstall from scratch
@@ -278,6 +280,7 @@ setup_environment() {
278280
# Parse command line arguments
279281
FIGURE=""
280282
TRAIN=false
283+
MAX_GPUS=""
281284
DATA_PATH="data/model_results.pkl"
282285
OUTPUT_DIR="paper/figs/source"
283286
LIST_FIGURES=false
@@ -301,6 +304,10 @@ while [[ $# -gt 0 ]]; do
301304
TRAIN=true
302305
shift
303306
;;
307+
-g|--max-gpus)
308+
MAX_GPUS="$2"
309+
shift 2
310+
;;
304311
-d|--data)
305312
DATA_PATH="$2"
306313
shift 2
@@ -410,6 +417,10 @@ if [ "$TRAIN" = true ]; then
410417
PYTHON_CMD="$PYTHON_CMD --train"
411418
fi
412419

420+
if [ -n "$MAX_GPUS" ]; then
421+
PYTHON_CMD="$PYTHON_CMD --max-gpus $MAX_GPUS"
422+
fi
423+
413424
if [ "$DATA_PATH" != "data/model_results.pkl" ]; then
414425
PYTHON_CMD="$PYTHON_CMD --data $DATA_PATH"
415426
fi
