Commit 76b0838

Feature(backend): Add user toggle to run encoder models on CPU (#8777)

Authored by lstein, Copilot, and JPPhoto
1 parent b7d7cd0 · commit 76b0838

Commit message:

* feature(backend) Add user toggle to run encoder models on CPU
  Co-authored-by: lstein <111189+lstein@users.noreply.github.com>

  Add frontend UI for CPU-only model execution toggle
  Co-authored-by: lstein <111189+lstein@users.noreply.github.com>

* chore(frontend): remove package lock file created by npm

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: lstein <111189+lstein@users.noreply.github.com>
Co-authored-by: Jonathan <34005131+JPPhoto@users.noreply.github.com>
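
The core idea of the change: encoder-capable model configs gain an optional cpu_only flag, and the encoding invocations stop assuming a single global torch device, instead following whatever device the loaded model actually lives on. As a rough sketch of how such a flag could feed into device selection (the loader-side logic is in files not shown in this excerpt, so the helper name and structure below are illustrative assumptions, not the commit's actual code):

import torch

def choose_execution_device(cpu_only: bool | None) -> torch.device:
    # Hypothetical helper: honor a per-model CPU-only override before falling
    # back to the usual accelerator detection.
    if cpu_only:
        return torch.device("cpu")
    if torch.cuda.is_available():
        return torch.device("cuda")
    if torch.backends.mps.is_available():
        return torch.device("mps")
    return torch.device("cpu")

# e.g. device = choose_execution_device(model_config.cpu_only)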

File tree

27 files changed (+1793, -721 lines)

invokeai/app/invocations/cogview4_text_encoder.py

Lines changed: 3 additions & 2 deletions

@@ -10,7 +10,6 @@
     CogView4ConditioningInfo,
     ConditioningFieldData,
 )
-from invokeai.backend.util.devices import TorchDevice

 # The CogView4 GLM Text Encoder max sequence length set based on the default in diffusers.
 COGVIEW4_GLM_MAX_SEQ_LEN = 1024
@@ -37,6 +36,8 @@ class CogView4TextEncoderInvocation(BaseInvocation):
     @torch.no_grad()
     def invoke(self, context: InvocationContext) -> CogView4ConditioningOutput:
         glm_embeds = self._glm_encode(context, max_seq_len=COGVIEW4_GLM_MAX_SEQ_LEN)
+        # Move embeddings to CPU for storage to save VRAM
+        glm_embeds = glm_embeds.detach().to("cpu")
         conditioning_data = ConditioningFieldData(conditionings=[CogView4ConditioningInfo(glm_embeds=glm_embeds)])
         conditioning_name = context.conditioning.save(conditioning_data)
         return CogView4ConditioningOutput.build(conditioning_name)
@@ -85,7 +86,7 @@ def _glm_encode(self, context: InvocationContext, max_seq_len: int) -> torch.Ten
         )
         text_input_ids = torch.cat([pad_ids, text_input_ids], dim=1)
         prompt_embeds = glm_text_encoder(
-            text_input_ids.to(TorchDevice.choose_torch_device()), output_hidden_states=True
+            text_input_ids.to(glm_text_encoder.device), output_hidden_states=True
         ).hidden_states[-2]

         assert isinstance(prompt_embeds, torch.Tensor)
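
The pattern above recurs throughout the commit: instead of sending token IDs to TorchDevice.choose_torch_device(), the invocation asks the loaded encoder where it lives and moves the inputs there, so a CPU-only encoder receives CPU tensors. A minimal standalone illustration of that pattern using a generic Hugging Face text encoder (the model id here is a placeholder, not what InvokeAI loads):

import torch
from transformers import AutoModel, AutoTokenizer

# Placeholder model; InvokeAI loads its encoders through its own model manager.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoder = AutoModel.from_pretrained("bert-base-uncased")  # stays on CPU unless moved

tokens = tokenizer(["a photo of a cat"], return_tensors="pt")
with torch.no_grad():
    # Follow the model's device instead of assuming a global one.
    hidden = encoder(input_ids=tokens["input_ids"].to(encoder.device)).last_hidden_state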

invokeai/app/invocations/compel.py

Lines changed: 2 additions & 2 deletions

@@ -103,7 +103,7 @@ def _lora_loader() -> Iterator[Tuple[ModelPatchRaw, float]]:
             textual_inversion_manager=ti_manager,
             dtype_for_device_getter=TorchDevice.choose_torch_dtype,
             truncate_long_prompts=False,
-            device=TorchDevice.choose_torch_device(),
+            device=text_encoder.device,  # Use the device the model is actually on
             split_long_text_mode=SplitLongTextMode.SENTENCES,
         )

@@ -212,7 +212,7 @@ def _lora_loader() -> Iterator[Tuple[ModelPatchRaw, float]]:
             truncate_long_prompts=False,  # TODO:
             returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED,  # TODO: clip skip
             requires_pooled=get_pooled,
-            device=TorchDevice.choose_torch_device(),
+            device=text_encoder.device,  # Use the device the model is actually on
             split_long_text_mode=SplitLongTextMode.SENTENCES,
         )

invokeai/app/invocations/flux_text_encoder.py

Lines changed: 6 additions & 0 deletions

@@ -58,6 +58,12 @@ def invoke(self, context: InvocationContext) -> FluxConditioningOutput:
         # scoped. This ensures that the T5 model can be freed and gc'd before loading the CLIP model (if necessary).
         t5_embeddings = self._t5_encode(context)
         clip_embeddings = self._clip_encode(context)
+
+        # Move embeddings to CPU for storage to save VRAM
+        # They will be moved to the appropriate device when used by the denoiser
+        t5_embeddings = t5_embeddings.detach().to("cpu")
+        clip_embeddings = clip_embeddings.detach().to("cpu")
+
         conditioning_data = ConditioningFieldData(
             conditionings=[FLUXConditioningInfo(clip_embeds=clip_embeddings, t5_embeds=t5_embeddings)]
         )
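
The detach-and-offload step above is what keeps cached conditioning from pinning VRAM: detach() drops any autograd references and .to("cpu") parks the saved tensors in system RAM until the denoiser pulls them back onto its device. A minimal illustration (the tensor shape is made up for the example):

import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
embeds = torch.randn(1, 512, 4096, device=device)  # stand-in for an encoder output

# Safe to cache or serialize without holding GPU memory or an autograd graph.
embeds = embeds.detach().to("cpu")

# Later, the consumer moves it to wherever it needs to run:
embeds_on_device = embeds.to(device)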

invokeai/app/invocations/sd3_text_encoder.py

Lines changed: 11 additions & 3 deletions

@@ -21,7 +21,6 @@
 from invokeai.backend.patches.lora_conversions.flux_lora_constants import FLUX_LORA_CLIP_PREFIX
 from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
 from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningFieldData, SD3ConditioningInfo
-from invokeai.backend.util.devices import TorchDevice

 # The SD3 T5 Max Sequence Length set based on the default in diffusers.
 SD3_T5_MAX_SEQ_LEN = 256
@@ -69,6 +68,15 @@ def invoke(self, context: InvocationContext) -> SD3ConditioningOutput:
         if self.t5_encoder is not None:
             t5_embeddings = self._t5_encode(context, SD3_T5_MAX_SEQ_LEN)

+        # Move all embeddings to CPU for storage to save VRAM
+        # They will be moved to the appropriate device when used by the denoiser
+        clip_l_embeddings = clip_l_embeddings.detach().to("cpu")
+        clip_l_pooled_embeddings = clip_l_pooled_embeddings.detach().to("cpu")
+        clip_g_embeddings = clip_g_embeddings.detach().to("cpu")
+        clip_g_pooled_embeddings = clip_g_pooled_embeddings.detach().to("cpu")
+        if t5_embeddings is not None:
+            t5_embeddings = t5_embeddings.detach().to("cpu")
+
         conditioning_data = ConditioningFieldData(
             conditionings=[
                 SD3ConditioningInfo(
@@ -117,7 +125,7 @@ def _t5_encode(self, context: InvocationContext, max_seq_len: int) -> torch.Tens
                 f" {max_seq_len} tokens: {removed_text}"
             )

-        prompt_embeds = t5_text_encoder(text_input_ids.to(TorchDevice.choose_torch_device()))[0]
+        prompt_embeds = t5_text_encoder(text_input_ids.to(t5_text_encoder.device))[0]

         assert isinstance(prompt_embeds, torch.Tensor)
         return prompt_embeds
@@ -180,7 +188,7 @@ def _clip_encode(
                 f" {tokenizer_max_length} tokens: {removed_text}"
             )
         prompt_embeds = clip_text_encoder(
-            input_ids=text_input_ids.to(TorchDevice.choose_torch_device()), output_hidden_states=True
+            input_ids=text_input_ids.to(clip_text_encoder.device), output_hidden_states=True
         )
         pooled_prompt_embeds = prompt_embeds[0]
         prompt_embeds = prompt_embeds.hidden_states[-2]

invokeai/app/invocations/z_image_text_encoder.py

Lines changed: 5 additions & 1 deletion

@@ -57,6 +57,8 @@ class ZImageTextEncoderInvocation(BaseInvocation):
     @torch.no_grad()
     def invoke(self, context: InvocationContext) -> ZImageConditioningOutput:
         prompt_embeds = self._encode_prompt(context, max_seq_len=Z_IMAGE_MAX_SEQ_LEN)
+        # Move embeddings to CPU for storage to save VRAM
+        prompt_embeds = prompt_embeds.detach().to("cpu")
         conditioning_data = ConditioningFieldData(conditionings=[ZImageConditioningInfo(prompt_embeds=prompt_embeds)])
         conditioning_name = context.conditioning.save(conditioning_data)
         return ZImageConditioningOutput(
@@ -69,7 +71,6 @@ def _encode_prompt(self, context: InvocationContext, max_seq_len: int) -> torch.
         Based on the ZImagePipeline._encode_prompt method from diffusers.
         """
         prompt = self.prompt
-        device = TorchDevice.choose_torch_device()

         text_encoder_info = context.models.load(self.qwen3_encoder.text_encoder)
         tokenizer_info = context.models.load(self.qwen3_encoder.tokenizer)
@@ -78,6 +79,9 @@ def _encode_prompt(self, context: InvocationContext, max_seq_len: int) -> torch.
             (_, text_encoder) = exit_stack.enter_context(text_encoder_info.model_on_device())
             (_, tokenizer) = exit_stack.enter_context(tokenizer_info.model_on_device())

+            # Use the device that the text_encoder is actually on
+            device = text_encoder.device
+
             # Apply LoRA models to the text encoder
             lora_dtype = TorchDevice.choose_bfloat16_safe_dtype(device)
             exit_stack.enter_context(
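
Because device now comes from the loaded encoder rather than a global setting, downstream choices such as the LoRA dtype follow it too. TorchDevice.choose_bfloat16_safe_dtype is InvokeAI's own utility and its exact logic is not shown in this diff; the sketch below is a plausible guess at what a bfloat16-safe chooser might look like, not the real implementation:

import torch

def bfloat16_safe_dtype(device: torch.device) -> torch.dtype:
    # Hypothetical logic: bfloat16 is generally usable on CPU and on CUDA
    # devices that report bf16 support; fall back to float32 elsewhere.
    if device.type == "cpu":
        return torch.bfloat16
    if device.type == "cuda" and torch.cuda.is_bf16_supported():
        return torch.bfloat16
    return torch.float32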

invokeai/app/services/model_records/model_records_base.py

Lines changed: 1 addition & 0 deletions

@@ -88,6 +88,7 @@ class ModelRecordChanges(BaseModelExcludeNull):
     default_settings: Optional[MainModelDefaultSettings | LoraModelDefaultSettings | ControlAdapterDefaultSettings] = (
         Field(description="Default settings for this model", default=None)
     )
+    cpu_only: Optional[bool] = Field(description="Whether this model should run on CPU only", default=None)

     # Checkpoint-specific changes
     # TODO(MM2): Should we expose these? Feels footgun-y...
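
The new field is Optional with a None default, matching the other entries of ModelRecordChanges: an unset field can mean "leave the stored value untouched" when a record is patched, while True/False explicitly set or clear the flag. A small self-contained sketch of that tri-state update pattern (the classes and helper below are illustrative, not InvokeAI's actual record code):

from typing import Optional
from pydantic import BaseModel, Field

class RecordChanges(BaseModel):
    # None = leave the stored value untouched; True/False = explicitly set it.
    cpu_only: Optional[bool] = Field(default=None, description="Whether this model should run on CPU only")

def apply_changes(stored: dict, changes: RecordChanges) -> dict:
    # exclude_none=True keeps still-None fields out of the update.
    stored.update(changes.model_dump(exclude_none=True))
    return stored

# apply_changes({"name": "t5", "cpu_only": False}, RecordChanges(cpu_only=True))
# -> {"name": "t5", "cpu_only": True}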

invokeai/backend/flux/modules/conditioner.py

Lines changed: 4 additions & 3 deletions

@@ -3,8 +3,6 @@
 from torch import Tensor, nn
 from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast

-from invokeai.backend.util.devices import TorchDevice
-

 class HFEncoder(nn.Module):
     def __init__(
@@ -33,8 +31,11 @@ def forward(self, text: list[str]) -> Tensor:
             return_tensors="pt",
         )

+        # Move inputs to the same device as the model to support cpu_only models
+        model_device = next(self.hf_module.parameters()).device
+
         outputs = self.hf_module(
-            input_ids=batch_encoding["input_ids"].to(TorchDevice.choose_torch_device()),
+            input_ids=batch_encoding["input_ids"].to(model_device),
             attention_mask=None,
             output_hidden_states=False,
         )
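
Here the device is read from the wrapped module's parameters rather than from a .device attribute, a lookup that works for any nn.Module, including ones that do not expose a device property. A compact illustration of that lookup (the module below is a toy stand-in for self.hf_module):

import torch
from torch import nn

module = nn.Linear(8, 8)  # toy stand-in for the wrapped text encoder

# nn.Module has no .device attribute; infer the device from the parameters.
model_device = next(module.parameters()).device

x = torch.randn(1, 8)
y = module(x.to(model_device))  # inputs follow the module's device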

invokeai/backend/model_manager/configs/clip_embed.py

Lines changed: 1 addition & 0 deletions

@@ -41,6 +41,7 @@ class CLIPEmbed_Diffusers_Config_Base(Diffusers_Config_Base):
     base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any)
     type: Literal[ModelType.CLIPEmbed] = Field(default=ModelType.CLIPEmbed)
     format: Literal[ModelFormat.Diffusers] = Field(default=ModelFormat.Diffusers)
+    cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")

     @classmethod
     def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:

invokeai/backend/model_manager/configs/clip_vision.py

Lines changed: 1 addition & 0 deletions

@@ -28,6 +28,7 @@ class CLIPVision_Diffusers_Config(Diffusers_Config_Base, Config_Base):
     base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any)
     type: Literal[ModelType.CLIPVision] = Field(default=ModelType.CLIPVision)
     format: Literal[ModelFormat.Diffusers] = Field(default=ModelFormat.Diffusers)
+    cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")

     @classmethod
     def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:

invokeai/backend/model_manager/configs/llava_onevision.py

Lines changed: 1 addition & 0 deletions

@@ -25,6 +25,7 @@ class LlavaOnevision_Diffusers_Config(Diffusers_Config_Base, Config_Base):

     type: Literal[ModelType.LlavaOnevision] = Field(default=ModelType.LlavaOnevision)
     base: Literal[BaseModelType.Any] = Field(default=BaseModelType.Any)
+    cpu_only: bool | None = Field(default=None, description="Whether this model should run on CPU only")

     @classmethod
     def from_model_on_disk(cls, mod: ModelOnDisk, override_fields: dict[str, Any]) -> Self:
