Commit e8ed6b9

Fix multi-GPU parallel training being disabled
- Only disable multiprocessing for single GPU or non-CUDA devices
- Enable parallel training when multiple GPUs are detected
- Add clear logging to show sequential vs parallel mode

The issue was that NO_MULTIPROCESSING=1 was always being set, forcing sequential training even on multi-GPU systems. Now:

- Multiple GPUs: parallel training enabled
- Single GPU: sequential mode (avoids overhead)
- CPU/MPS: sequential mode (required)

This fixes the issue where 8-GPU systems were only using 1 GPU.
1 parent f6bfaee commit e8ed6b9
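
The fix relies on an environment-variable contract: generate_figures.py sets NO_MULTIPROCESSING=1 and the training subprocess is expected to fall back to sequential training when it sees the flag. A minimal sketch of how such a consumer might branch on it is below; launch_models, train_one, and the spawn-based pool are illustrative assumptions, not code from this repository.

# Hedged sketch: how a training script might honor NO_MULTIPROCESSING.
# Only the variable name comes from this commit; everything else here
# (launch_models, train_one, the spawn pool) is assumed for illustration.
import os
import multiprocessing as mp

def launch_models(configs, train_one, gpu_count):
    if os.environ.get('NO_MULTIPROCESSING') == '1':
        # Sequential mode: train each model in the current process.
        for cfg in configs:
            train_one(cfg)
    else:
        # Parallel mode: one worker per model, capped by the GPU count.
        # CUDA generally requires the 'spawn' start method in workers.
        ctx = mp.get_context('spawn')
        with ctx.Pool(processes=min(gpu_count, len(configs))) as pool:
            pool.map(train_one, configs)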

File tree

1 file changed: +15 -3 lines changed

code/generate_figures.py

Lines changed: 15 additions & 3 deletions
@@ -72,10 +72,22 @@ def train_models(max_gpus=None):
     # Train models
     safe_print("\nTraining models...")
     try:
-        # Set environment to disable tqdm and multiprocessing (which can hang in subprocess)
+        # Set environment variables for training
         env = os.environ.copy()
-        env['DISABLE_TQDM'] = '1'
-        env['NO_MULTIPROCESSING'] = '1'
+        env['DISABLE_TQDM'] = '1'  # Disable progress bars in subprocess
+        # Only disable multiprocessing if we have a single GPU or non-GPU device
+        # With multiple GPUs, we want parallel training
+        if torch.cuda.is_available():
+            gpu_count = torch.cuda.device_count()
+            if gpu_count <= 1:
+                env['NO_MULTIPROCESSING'] = '1'
+                safe_print("Single GPU detected - using sequential mode")
+            else:
+                safe_print(f"Multiple GPUs detected ({gpu_count}) - using parallel training")
+        else:
+            # Non-CUDA device (CPU or MPS)
+            env['NO_MULTIPROCESSING'] = '1'
+            safe_print("Non-CUDA device - using sequential mode")
         # Set PyTorch memory management for better GPU memory usage
         env['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
         # Pass through max GPUs limit if specified
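
The hunk is cut off before the max-GPUs passthrough, so this diff does not show how that limit reaches the subprocess. One plausible sketch, assuming the limit is forwarded through CUDA_VISIBLE_DEVICES (an assumption, not confirmed by this commit):

# Hypothetical only: restrict the training subprocess to the first max_gpus
# devices. run_training, the script name, and the CUDA_VISIBLE_DEVICES
# mechanism are assumptions; the real passthrough is truncated out of the diff.
import subprocess

def run_training(env, max_gpus=None, script='train_models.py'):
    if max_gpus is not None:
        env['CUDA_VISIBLE_DEVICES'] = ','.join(str(i) for i in range(max_gpus))
    return subprocess.run(['python', script], env=env, check=True)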
