Enable training on Apple Silicon (M2) Macs via the PyTorch MPS backend

Abe404 · Abe404 · commit 9e2344b3e5ef · 2025-10-31T15:37:18.000+01:00
diff --git a/trainer/src/model_utils.py b/trainer/src/model_utils.py
@@ -48,14 +48,14 @@ def get_latest_model_paths(model_dir, k):
 
 def load_model(model_path):
     model = UNetGNRes()
-    if torch.cuda.is_available():
+    if torch.cuda.is_available() or torch.backends.mps.is_available():
         try:
             model.load_state_dict(torch.load(model_path))
             model = torch.nn.DataParallel(model)
         except:
             model = torch.nn.DataParallel(model)
             model.load_state_dict(torch.load(model_path))
-        model.cuda()
+        model.to(device)
     else:
         # if you are running on a CPU-only machine, please use torch.load with 
         # map_location=torch.device('cpu') to map your storages to the CPU.
@@ -77,8 +77,8 @@ def create_first_model_with_random_weights(model_dir):
     model_path = os.path.join(model_dir, model_name)
     torch.save(model.state_dict(), model_path)
 
-    if torch.cuda.is_available():
-        model.cuda()
+    if torch.cuda.is_available() or torch.backends.mps.is_available():
+        model.to(device)
     return model
 
 
@@ -292,14 +292,13 @@ def unet_segment(cnn, image, bs, in_w, out_w, threshold=0.5):
                 tile_idx += 1
                 tiles_to_process.append(tile)
         tiles_for_gpu = torch.from_numpy(np.array(tiles_to_process))
-        if torch.cuda.is_available():
-            tiles_for_gpu.cuda()
+        tiles_for_gpu = tiles_for_gpu.to(device)
         tiles_for_gpu = tiles_for_gpu.float()
         batches.append(tiles_for_gpu)
 
     output_tiles = []
     for gpu_tiles in batches:
-        outputs = cnn(gpu_tiles.cuda())
+        outputs = cnn(gpu_tiles.to(device))
         softmaxed = softmax(outputs, 1)
         foreground_probs = softmaxed[:, 1, :]  # just the foreground probability.
         if threshold is not None:
diff --git a/trainer/src/trainer.py b/trainer/src/trainer.py
@@ -89,14 +89,25 @@ def __init__(self, sync_dir=None, patch_size=572,
         total_mem = 0
         self.num_workers=min(multiprocessing.cpu_count(), max_workers)
         print(self.num_workers, 'workers assigned for data loader')
-        print('GPU Available', torch.cuda.is_available())
+        print('CUDA Available', torch.cuda.is_available())
+
         if torch.cuda.is_available():
             for i in range(torch.cuda.device_count()):
                 total_mem += torch.cuda.get_device_properties(i).total_memory
+
+        print('MPS Available', torch.backends.mps.is_available())
+        # MPS only exposes a single device.
+        # There is no obvious way to query the memory available to MPS.
+        # FIXME: total_mem below is an arbitrary placeholder, not a measured value.
+        if torch.backends.mps.is_available():
+            total_mem = 24_589_934_592
+
+        if total_mem > 0: # means CUDA or MPS found
             self.bs = total_mem // mem_per_item
             self.bs = min(12, self.bs)
         else:
             self.bs = 1 # cpu is batch size of 1
+
         print('Batch size', self.bs)
         self.optimizer = None
         # used to check for updates
@@ -287,6 +298,9 @@ def train_one_epoch(self):
         if not [is_photo(a) for a in ls(val_annot_dir)]:
             return
 
+
+        device = model_utils.get_device()
+
         if self.first_loop:
             self.first_loop = False
             self.write_message('Training started')
@@ -313,9 +327,9 @@ def train_one_epoch(self):
                    defined_tiles) in enumerate(train_loader):
 
             self.check_for_instructions()
-            photo_tiles = photo_tiles.cuda()
-            foreground_tiles = foreground_tiles.cuda()
-            defined_tiles = defined_tiles.cuda()
+            photo_tiles = photo_tiles.to(device)
+            foreground_tiles = foreground_tiles.to(device)
+            defined_tiles = defined_tiles.to(device)
             self.optimizer.zero_grad()
             outputs = self.model(photo_tiles)
             softmaxed = softmax(outputs, 1)