Skip to content

Commit 280fb32

Browse files
Fix FLUX2 Klein load-time VRAM spikes on low-memory GPUs.
In low-VRAM mode, keep the transformer and the Qwen text encoder off CUDA during the initial load and quantization phase. This lets model startup avoid a full-model out-of-memory error before offloading and quantization can take effect.
Co-authored-by: Cursor <cursoragent@cursor.com>
1 parent de7d22c commit 280fb32

File tree

2 files changed

+14
-9
lines changed

2 files changed

+14
-9
lines changed

extensions_built_in/diffusion_models/flux2/flux2_klein_model.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -44,15 +44,14 @@ def load_te(self):
4444
self.flux2_klein_te_path,
4545
torch_dtype=dtype,
4646
)
47-
text_encoder.to(self.device_torch, dtype=dtype)
48-
49-
flush()
50-
5147
if self.model_config.quantize_te:
5248
self.print_and_status_update("Quantizing Qwen3")
53-
quantize(text_encoder, weights=get_qtype(self.model_config.qtype))
49+
quantize(text_encoder, weights=get_qtype(self.model_config.qtype_te))
5450
freeze(text_encoder)
5551
flush()
52+
elif not self.model_config.low_vram:
53+
text_encoder.to(self.device_torch, dtype=dtype)
54+
flush()
5655

5756
if (
5857
self.model_config.layer_offloading

extensions_built_in/diffusion_models/flux2/flux2_model.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -155,11 +155,11 @@ def load_model(self):
155155

156156
transformer.load_state_dict(transformer_state_dict, assign=True)
157157

158-
transformer.to(self.quantize_device, dtype=dtype)
159-
160158
if self.model_config.quantize:
161159
# patch the state dict method
162160
patch_dequantization_on_save(transformer)
161+
# Avoid full-model peak VRAM allocation before quantization.
162+
self.print_and_status_update("Keeping transformer on CPU for quantization")
163163
self.print_and_status_update("Quantizing Transformer")
164164
quantize_model(self, transformer)
165165
flush()
@@ -234,10 +234,16 @@ def load_model(self):
234234

235235
flush()
236236
# just to make sure everything is on the right device and dtype
237-
text_encoder[0].to(self.device_torch)
237+
if self.model_config.low_vram:
238+
text_encoder[0].to("cpu")
239+
else:
240+
text_encoder[0].to(self.device_torch)
238241
text_encoder[0].requires_grad_(False)
239242
text_encoder[0].eval()
240-
pipe.transformer = pipe.transformer.to(self.device_torch)
243+
if self.model_config.low_vram:
244+
pipe.transformer = pipe.transformer.to("cpu")
245+
else:
246+
pipe.transformer = pipe.transformer.to(self.device_torch)
241247
flush()
242248

243249
# save it to the model class

0 commit comments

Comments (0)