From 7383566136983588d63490767320fda7d3c8f55b Mon Sep 17 00:00:00 2001 From: Rypo Date: Tue, 26 Nov 2024 15:43:04 -0600 Subject: [PATCH 1/2] fix: CUDA error: out of memory on non_blocking calls Removes non_blocking argument from all device to cpu transfers. In certain environments (e.g. WSL) large transfers will throw a CUDA memory error regardless of VRAM available. Adjusts stream synchronize for modest performance gains with cpu_offload. fixes #90, fixes #117 --- OmniGen/scheduler.py | 8 ++++---- OmniGen/transformer.py | 22 +++++++++++----------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/OmniGen/scheduler.py b/OmniGen/scheduler.py index e63dab0..8e20ec4 100644 --- a/OmniGen/scheduler.py +++ b/OmniGen/scheduler.py @@ -38,8 +38,8 @@ def evict_previous_layer(self, layer_idx: int): prev_layer_idx = -1 else: prev_layer_idx = (layer_idx - 1) % len(self) - self.key_cache[prev_layer_idx] = self.key_cache[prev_layer_idx].to("cpu", non_blocking=True) - self.value_cache[prev_layer_idx] = self.value_cache[prev_layer_idx].to("cpu", non_blocking=True) + self.key_cache[prev_layer_idx] = self.key_cache[prev_layer_idx].to("cpu") + self.value_cache[prev_layer_idx] = self.value_cache[prev_layer_idx].to("cpu") def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]: @@ -50,9 +50,9 @@ def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]: torch.cuda.current_stream().synchronize() self.evict_previous_layer(layer_idx) # Load current layer cache to its original device if not already there - original_device = self.original_device[layer_idx] + #original_device = self.original_device[layer_idx] # self.prefetch_stream.synchronize(original_device) - torch.cuda.synchronize(self.prefetch_stream) + self.prefetch_stream.synchronize() key_tensor = self.key_cache[layer_idx] value_tensor = self.value_cache[layer_idx] diff --git a/OmniGen/transformer.py b/OmniGen/transformer.py index 4df2006..5b9a2e3 100644 --- a/OmniGen/transformer.py +++ b/OmniGen/transformer.py @@ -33,14 +33,12 @@ def prefetch_layer(self, layer_idx: int, device: torch.device): "Starts prefetching the next layer cache" with torch.cuda.stream(self.prefetch_stream): # Prefetch next layer tensors to GPU - for name, param in self.layers[layer_idx].named_parameters(): - param.data = param.data.to(device, non_blocking=True) + self.layers[layer_idx] = self.layers[layer_idx].to(device, non_blocking=True) def evict_previous_layer(self, layer_idx: int): "Moves the previous layer cache to the CPU" prev_layer_idx = layer_idx - 1 - for name, param in self.layers[prev_layer_idx].named_parameters(): - param.data = param.data.to("cpu", non_blocking=True) + self.layers[prev_layer_idx] = self.layers[prev_layer_idx].to("cpu") def get_offlaod_layer(self, layer_idx: int, device: torch.device): # init stream @@ -48,11 +46,14 @@ def get_offlaod_layer(self, layer_idx: int, device: torch.device): self.prefetch_stream = torch.cuda.Stream() # delete previous layer - torch.cuda.current_stream().synchronize() - self.evict_previous_layer(layer_idx) + # main stream sync shouldn't be necessary since all computation on iter i-1 is finished by iter i + # torch.cuda.current_stream().synchronize() + # avoid extra eviction of last layer + if layer_idx > 0: + self.evict_previous_layer(layer_idx) # make sure the current layer is ready - torch.cuda.synchronize(self.prefetch_stream) + self.prefetch_stream.synchronize() # load next layer self.prefetch_layer((layer_idx + 1) % len(self.layers), device) @@ -133,10 +134,9 @@ def forward( all_self_attns = () if output_attentions else None next_decoder_cache = None - layer_idx = -1 - for decoder_layer in self.layers: - layer_idx += 1 - + for layer_idx in range(len(self.layers)): + # direct indexing since offloading may mutate self.layers during iteration + decoder_layer = self.layers[layer_idx] if output_hidden_states: all_hidden_states += (hidden_states,) From 0fa5f5d69ba4ac5a97fadb911f9d5d82bd7fb8d3 Mon Sep 17 00:00:00 2001 From: Rypo Date: Tue, 3 Dec 2024 08:21:03 -0600 Subject: [PATCH 2/2] fix: revert layer offload iteration --- OmniGen/transformer.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/OmniGen/transformer.py b/OmniGen/transformer.py index 5b9a2e3..ca36b91 100644 --- a/OmniGen/transformer.py +++ b/OmniGen/transformer.py @@ -33,12 +33,14 @@ def prefetch_layer(self, layer_idx: int, device: torch.device): "Starts prefetching the next layer cache" with torch.cuda.stream(self.prefetch_stream): # Prefetch next layer tensors to GPU - self.layers[layer_idx] = self.layers[layer_idx].to(device, non_blocking=True) + for name, param in self.layers[layer_idx].named_parameters(): + param.data = param.data.to(device, non_blocking=True) def evict_previous_layer(self, layer_idx: int): "Moves the previous layer cache to the CPU" prev_layer_idx = layer_idx - 1 - self.layers[prev_layer_idx] = self.layers[prev_layer_idx].to("cpu") + for name, param in self.layers[prev_layer_idx].named_parameters(): + param.data = param.data.to("cpu") def get_offlaod_layer(self, layer_idx: int, device: torch.device): # init stream