
Commit 7383566

fix: CUDA error: out of memory on non_blocking calls
Removes the non_blocking argument from all device-to-CPU transfers. In certain environments (e.g. WSL), large non_blocking transfers throw a CUDA out-of-memory error regardless of the VRAM available. Also adjusts stream synchronization for modest performance gains with cpu_offload. Fixes #90, fixes #117
1 parent 36ee929 commit 7383566
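
For context: a non_blocking copy from GPU to CPU generally only overlaps with other work when the destination is pinned host memory, and per the issues referenced above, large asynchronous transfers can fail outright on some platforms (e.g. WSL). A minimal before/after sketch, not taken from this commit, using a hypothetical tensor `x`:

```python
import torch

# Illustrative only: offload a large tensor from GPU to CPU (assumes CUDA is available).
x = torch.randn(4096, 4096, device="cuda")

# Before: asynchronous device-to-CPU copy. With pageable (non-pinned) host memory
# this gains little, and on some setups (e.g. WSL) large async transfers were
# reported to raise CUDA out-of-memory errors (#90, #117).
# x_cpu = x.to("cpu", non_blocking=True)

# After: plain blocking copy; returns once the data is actually on the host.
x_cpu = x.to("cpu")
```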

File tree: OmniGen/scheduler.py, OmniGen/transformer.py (2 files changed, +15 -15 lines)


OmniGen/scheduler.py

Lines changed: 4 additions & 4 deletions
@@ -38,8 +38,8 @@ def evict_previous_layer(self, layer_idx: int):
                 prev_layer_idx = -1
             else:
                 prev_layer_idx = (layer_idx - 1) % len(self)
-            self.key_cache[prev_layer_idx] = self.key_cache[prev_layer_idx].to("cpu", non_blocking=True)
-            self.value_cache[prev_layer_idx] = self.value_cache[prev_layer_idx].to("cpu", non_blocking=True)
+            self.key_cache[prev_layer_idx] = self.key_cache[prev_layer_idx].to("cpu")
+            self.value_cache[prev_layer_idx] = self.value_cache[prev_layer_idx].to("cpu")


     def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
@@ -50,9 +50,9 @@ def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
                 torch.cuda.current_stream().synchronize()
                 self.evict_previous_layer(layer_idx)
                 # Load current layer cache to its original device if not already there
-                original_device = self.original_device[layer_idx]
+                #original_device = self.original_device[layer_idx]
                 # self.prefetch_stream.synchronize(original_device)
-                torch.cuda.synchronize(self.prefetch_stream)
+                self.prefetch_stream.synchronize()
                 key_tensor = self.key_cache[layer_idx]
                 value_tensor = self.value_cache[layer_idx]

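On the scheduler side, the unused original_device lookup is commented out and the prefetch wait now calls the stream's own synchronize(): torch.cuda.synchronize() takes a device and waits for every stream on it, whereas Stream.synchronize() blocks only until the work queued on that one stream has finished, which matches the "modest performance gains" noted in the commit message. A small illustrative sketch (assumes a CUDA device; the kv tensor and prefetch_stream name here are hypothetical stand-ins):

```python
import torch

prefetch_stream = torch.cuda.Stream()
kv = torch.randn(2, 1024, 1024)              # hypothetical CPU-resident KV tensor

with torch.cuda.stream(prefetch_stream):
    kv = kv.to("cuda", non_blocking=True)    # prefetch queued on the side stream

# Device-wide wait: blocks until every stream on the device is idle.
# torch.cuda.synchronize()

# Narrower wait used after this commit: only the prefetch copies must finish.
prefetch_stream.synchronize()

print(kv.device)                             # cuda:0 once the copy has completed
```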

OmniGen/transformer.py

Lines changed: 11 additions & 11 deletions
@@ -33,26 +33,27 @@ def prefetch_layer(self, layer_idx: int, device: torch.device):
         "Starts prefetching the next layer cache"
         with torch.cuda.stream(self.prefetch_stream):
             # Prefetch next layer tensors to GPU
-            for name, param in self.layers[layer_idx].named_parameters():
-                param.data = param.data.to(device, non_blocking=True)
+            self.layers[layer_idx] = self.layers[layer_idx].to(device, non_blocking=True)

     def evict_previous_layer(self, layer_idx: int):
         "Moves the previous layer cache to the CPU"
         prev_layer_idx = layer_idx - 1
-        for name, param in self.layers[prev_layer_idx].named_parameters():
-            param.data = param.data.to("cpu", non_blocking=True)
+        self.layers[prev_layer_idx] = self.layers[prev_layer_idx].to("cpu")

     def get_offlaod_layer(self, layer_idx: int, device: torch.device):
         # init stream
         if not hasattr(self, "prefetch_stream"):
             self.prefetch_stream = torch.cuda.Stream()

         # delete previous layer
-        torch.cuda.current_stream().synchronize()
-        self.evict_previous_layer(layer_idx)
+        # main stream sync shouldn't be necessary since all computation on iter i-1 is finished by iter i
+        # torch.cuda.current_stream().synchronize()
+        # avoid extra eviction of last layer
+        if layer_idx > 0:
+            self.evict_previous_layer(layer_idx)

         # make sure the current layer is ready
-        torch.cuda.synchronize(self.prefetch_stream)
+        self.prefetch_stream.synchronize()

         # load next layer
         self.prefetch_layer((layer_idx + 1) % len(self.layers), device)
@@ -133,10 +134,9 @@ def forward(
         all_self_attns = () if output_attentions else None
         next_decoder_cache = None

-        layer_idx = -1
-        for decoder_layer in self.layers:
-            layer_idx += 1
-
+        for layer_idx in range(len(self.layers)):
+            # direct indexing since offloading may mutate self.layers during iteration
+            decoder_layer = self.layers[layer_idx]
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)

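In the transformer, whole decoder layers are now moved with Module.to() instead of rewriting each param.data, eviction is skipped for layer 0 (avoiding the spurious eviction of the last layer at index -1 that the added comment calls out), and forward iterates by index because offloading rebinds entries of self.layers. Below is a self-contained sketch of that layer-offload loop, assuming a CUDA device; TinyBlock and the helper names are hypothetical stand-ins for the real decoder layers and methods above:

```python
import torch
from torch import nn

# Hypothetical stand-in for a decoder layer.
class TinyBlock(nn.Module):
    def __init__(self, dim: int = 64):
        super().__init__()
        self.proj = nn.Linear(dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(self.proj(x))

layers = nn.ModuleList([TinyBlock() for _ in range(3)]).to("cpu")
prefetch_stream = torch.cuda.Stream()
device = torch.device("cuda")

def prefetch_layer(idx: int) -> None:
    # Module.to(..., non_blocking=True) moves every parameter and buffer in one
    # call, replacing the per-parameter `param.data = param.data.to(...)` loop.
    with torch.cuda.stream(prefetch_stream):
        layers[idx] = layers[idx].to(device, non_blocking=True)

def evict_previous_layer(idx: int) -> None:
    layers[idx - 1] = layers[idx - 1].to("cpu")  # blocking device-to-CPU move

x = torch.randn(8, 64, device=device)
prefetch_layer(0)
# Iterate by index: offloading rebinds entries of `layers`, so direct indexing
# always picks up the module instance currently resident on the right device.
for layer_idx in range(len(layers)):
    if layer_idx > 0:              # skip the extra eviction of the last layer on step 0
        evict_previous_layer(layer_idx)
    prefetch_stream.synchronize()  # make sure the current layer's weights have arrived
    x = layers[layer_idx](x)
    prefetch_layer((layer_idx + 1) % len(layers))
```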

0 commit comments
