Skip to content

Commit 5c38c0d

Browse files
author
Kyle Butler
committed
Make MPS support additive and preserve CUDA defaults
1 parent 541815d commit 5c38c0d

File tree

11 files changed

+173
-80
lines changed

11 files changed

+173
-80
lines changed

config/examples/train_lora_flux_24gb.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ config:
99
training_folder: "output"
1010
# uncomment to see performance stats in the terminal every N steps
1111
# performance_log_every: 1000
12-
device: mps
12+
device: cuda:0
1313
# if a trigger word is specified, it will be added to captions of training data if it does not already exist
1414
# alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
1515
# trigger_word: "p3r5on"
@@ -45,7 +45,7 @@ config:
4545
train_text_encoder: false # probably won't work with flux
4646
gradient_checkpointing: true # need this on unless you have a ton of vram
4747
noise_scheduler: "flowmatch" # for training only
48-
optimizer: "adamw" # adamw8bit not supported on mps
48+
optimizer: "adamw8bit"
4949
lr: 1e-4
5050
# uncomment this to skip the pre training sample
5151
# skip_first_sample: true
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
---
2+
job: extension
3+
config:
4+
# this name will be the folder and filename name
5+
name: "my_first_flux_lora_v1"
6+
process:
7+
- type: 'sd_trainer'
8+
# root folder to save training sessions/samples/weights
9+
training_folder: "output"
10+
# uncomment to see performance stats in the terminal every N steps
11+
# performance_log_every: 1000
12+
device: mps
13+
# if a trigger word is specified, it will be added to captions of training data if it does not already exist
14+
# alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15+
# trigger_word: "p3r5on"
16+
network:
17+
type: "lora"
18+
linear: 16
19+
linear_alpha: 16
20+
save:
21+
dtype: float16 # precision to save
22+
save_every: 250 # save every this many steps
23+
max_step_saves_to_keep: 4 # how many intermittent saves to keep
24+
push_to_hub: false #change this to True to push your trained model to Hugging Face.
25+
# You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
26+
# hf_repo_id: your-username/your-model-slug
27+
# hf_private: true #whether the repo is private or public
28+
datasets:
29+
# datasets are a folder of images. captions need to be txt files with the same name as the image
30+
# for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
31+
# images will automatically be resized and bucketed into the resolution specified
32+
# on windows, escape back slashes with another backslash so
33+
# "C:\\path\\to\\images\\folder"
34+
- folder_path: "/path/to/images/folder"
35+
caption_ext: "txt"
36+
caption_dropout_rate: 0.05 # will drop out the caption 5% of time
37+
shuffle_tokens: false # shuffle caption order, split by commas
38+
cache_latents_to_disk: true # leave this true unless you know what you're doing
39+
resolution: [ 512, 768, 1024 ] # flux enjoys multiple resolutions
40+
train:
41+
batch_size: 1
42+
steps: 2000 # total number of steps to train 500 - 4000 is a good range
43+
gradient_accumulation_steps: 1
44+
train_unet: true
45+
train_text_encoder: false # probably won't work with flux
46+
gradient_checkpointing: true # need this on unless you have a ton of vram
47+
noise_scheduler: "flowmatch" # for training only
48+
optimizer: "adamw" # adamw8bit not supported on mps
49+
lr: 1e-4
50+
# uncomment this to skip the pre training sample
51+
# skip_first_sample: true
52+
# uncomment to completely disable sampling
53+
# disable_sampling: true
54+
# uncomment to use new bell-curved weighting. Experimental but may produce better results
55+
# linear_timesteps: true
56+
57+
# ema will smooth out learning, but could slow it down. Recommended to leave on.
58+
ema_config:
59+
use_ema: true
60+
ema_decay: 0.99
61+
62+
# will probably need this if gpu supports it for flux, other dtypes may not work correctly
63+
dtype: bf16
64+
model:
65+
# huggingface model name or path
66+
name_or_path: "black-forest-labs/FLUX.1-dev"
67+
is_flux: true
68+
quantize: false # 8-bit quantization backends are CUDA-only
69+
# low_vram: true # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
70+
sample:
71+
sampler: "flowmatch" # must match train.noise_scheduler
72+
sample_every: 250 # sample every this many steps
73+
width: 1024
74+
height: 1024
75+
prompts:
76+
# you can add [trigger] to the prompts here and it will be replaced with the trigger word
77+
# - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
78+
- "woman with red hair, playing chess at the park, bomb going off in the background"
79+
- "a woman holding a coffee cup, in a beanie, sitting at a cafe"
80+
- "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
81+
- "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
82+
- "a bear building a log cabin in the snow covered mountains"
83+
- "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
84+
- "hipster man with a beard, building a chair, in a wood shop"
85+
- "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
86+
- "a man holding a sign that says, 'this is a sign'"
87+
- "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
88+
neg: "" # not used on flux
89+
seed: 42
90+
walk_seed: true
91+
guidance_scale: 4
92+
sample_steps: 20
93+
# you can add any additional meta info here. [name] is replaced with config name at top
94+
meta:
95+
name: "[name]"
96+
version: '1.0'

toolkit/custom_adapter.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,6 @@
3131
from toolkit.config_modules import AdapterConfig, AdapterTypes, TrainConfig
3232
from toolkit.prompt_utils import PromptEmbeds
3333
import weakref
34-
from toolkit import device_utils
3534

3635
if TYPE_CHECKING:
3736
from toolkit.stable_diffusion_model import StableDiffusion
@@ -221,7 +220,8 @@ def setup_adapter(self):
221220
elif self.adapter_type == 'llm_adapter':
222221
kwargs = {}
223222
if self.config.quantize_llm:
224-
if device_utils.is_mps_available():
223+
current_device = torch.device(self.device)
224+
if current_device.type == "mps":
225225
print("Warning: BitsAndBytes 4-bit quantization is not supported on MPS. Disabling quantization for LLM adapter.")
226226
self.config.quantize_llm = False
227227
else:

toolkit/device_utils.py

Lines changed: 42 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,57 +1,69 @@
1-
import torch
21
import gc
2+
from contextlib import nullcontext
3+
from typing import Optional, Union
4+
5+
import torch
6+
7+
8+
def _as_torch_device(device: Optional[Union[str, torch.device]] = None) -> torch.device:
9+
if device is None:
10+
return get_device()
11+
if isinstance(device, torch.device):
12+
return device
13+
return torch.device(device)
14+
315

416
def get_device() -> torch.device:
517
"""
618
Returns the best available device.
7-
Prioritizes MPS on macOS, then CUDA, then CPU.
19+
Prioritizes CUDA, then MPS, then CPU.
820
"""
9-
if torch.backends.mps.is_available():
10-
return torch.device("mps")
11-
elif torch.cuda.is_available():
21+
if torch.cuda.is_available():
1222
return torch.device("cuda")
23+
elif torch.backends.mps.is_available():
24+
return torch.device("mps")
1325
else:
1426
return torch.device("cpu")
1527

28+
1629
def is_mps_available() -> bool:
1730
return torch.backends.mps.is_available()
1831

32+
1933
def is_cuda_available() -> bool:
2034
return torch.cuda.is_available()
2135

22-
def empty_cache():
36+
37+
def empty_cache(device: Optional[Union[str, torch.device]] = None):
2338
"""
24-
Empties the cache for the current device.
39+
Empties the cache for the selected device.
2540
"""
41+
target_device = _as_torch_device(device)
2642
gc.collect()
27-
if is_mps_available():
28-
torch.mps.empty_cache()
29-
elif is_cuda_available():
43+
if target_device.type == "cuda" and is_cuda_available():
3044
torch.cuda.empty_cache()
45+
elif target_device.type == "mps" and is_mps_available():
46+
torch.mps.empty_cache()
3147

32-
def manual_seed(seed: int):
48+
49+
def manual_seed(seed: int, device: Optional[Union[str, torch.device]] = None):
3350
"""
34-
Sets the seed for the current device.
51+
Sets global seed and device-specific seed when supported.
3552
"""
53+
target_device = _as_torch_device(device)
3654
torch.manual_seed(seed)
37-
if is_mps_available():
38-
torch.mps.manual_seed(seed)
39-
elif is_cuda_available():
55+
if target_device.type == "cuda" and is_cuda_available():
4056
torch.cuda.manual_seed(seed)
57+
elif target_device.type == "mps" and is_mps_available():
58+
torch.mps.manual_seed(seed)
4159

42-
def get_device_name() -> str:
43-
if is_mps_available():
44-
return "mps"
45-
elif is_cuda_available():
46-
return "cuda"
47-
else:
48-
return "cpu"
4960

50-
def autocast():
51-
if is_mps_available():
52-
return torch.autocast(device_type="mps")
53-
elif is_cuda_available():
54-
return torch.autocast(device_type="cuda")
55-
else:
56-
# Fallback to cpu or simple context manager
57-
return torch.autocast(device_type="cpu")
61+
def get_device_name(device: Optional[Union[str, torch.device]] = None) -> str:
62+
return _as_torch_device(device).type
63+
64+
65+
def autocast(device: Optional[Union[str, torch.device]] = None):
66+
target_device = _as_torch_device(device)
67+
if target_device.type in {"cuda", "mps", "cpu"}:
68+
return torch.autocast(device_type=target_device.type)
69+
return nullcontext()

toolkit/losses.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@ def forward(self, pred, target):
4343

4444
# Gradient penalty
4545
def get_gradient_penalty(critic, real, fake, device):
46-
with device_utils.autocast():
46+
with device_utils.autocast(device):
4747
real = real.float()
4848
fake = fake.float()
4949
alpha = torch.rand(real.size(0), 1, 1, 1).to(device).float()

toolkit/optimizer.py

Lines changed: 12 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,8 @@ def get_optimizer(
6161

6262
optimizer = Adam8bit(params, lr=learning_rate, eps=1e-6, decouple=True, **optimizer_params)
6363
elif lower_type.endswith("8bit"):
64-
# Force fallback on MPS as bitsandbytes requires CUDA
6564
from toolkit import device_utils
66-
if device_utils.is_mps_available():
65+
if device_utils.get_device_name() == "mps":
6766
print("Bitsandbytes 8-bit optimizers are not supported on MPS. Falling back to standard optimizer.")
6867
if lower_type == "adam8bit":
6968
return torch.optim.Adam(params, lr=learning_rate, eps=1e-6, **optimizer_params)
@@ -79,33 +78,17 @@ def get_optimizer(
7978
# Fallback for ademamix or unknown - generic AdamW
8079
return torch.optim.AdamW(params, lr=learning_rate, eps=1e-6, **optimizer_params)
8180

82-
try:
83-
import bitsandbytes
84-
if lower_type == "adam8bit":
85-
return bitsandbytes.optim.Adam8bit(params, lr=learning_rate, eps=1e-6, **optimizer_params)
86-
if lower_type == "ademamix8bit":
87-
return bitsandbytes.optim.AdEMAMix8bit(params, lr=learning_rate, eps=1e-6, **optimizer_params)
88-
elif lower_type == "adamw8bit":
89-
return bitsandbytes.optim.AdamW8bit(params, lr=learning_rate, eps=1e-6, **optimizer_params)
90-
elif lower_type == "lion8bit":
91-
return bitsandbytes.optim.Lion8bit(params, lr=learning_rate, **optimizer_params)
92-
else:
93-
raise ValueError(f'Unknown optimizer type {optimizer_type}')
94-
except ImportError:
95-
print("Bitsandbytes not found or not supported. Falling back to standard optimizer.")
96-
if lower_type == "adam8bit":
97-
return torch.optim.Adam(params, lr=learning_rate, eps=1e-6, **optimizer_params)
98-
elif lower_type == "adamw8bit":
99-
return torch.optim.AdamW(params, lr=learning_rate, eps=1e-6, **optimizer_params)
100-
elif lower_type == "lion8bit":
101-
try:
102-
from lion_pytorch import Lion
103-
return Lion(params, lr=learning_rate, **optimizer_params)
104-
except ImportError:
105-
raise ImportError("Please install lion_pytorch to use Lion optimizer -> pip install lion-pytorch")
106-
else:
107-
# Fallback for ademamix or unknown - generic AdamW
108-
return torch.optim.AdamW(params, lr=learning_rate, eps=1e-6, **optimizer_params)
81+
import bitsandbytes
82+
if lower_type == "adam8bit":
83+
return bitsandbytes.optim.Adam8bit(params, lr=learning_rate, eps=1e-6, **optimizer_params)
84+
if lower_type == "ademamix8bit":
85+
return bitsandbytes.optim.AdEMAMix8bit(params, lr=learning_rate, eps=1e-6, **optimizer_params)
86+
elif lower_type == "adamw8bit":
87+
return bitsandbytes.optim.AdamW8bit(params, lr=learning_rate, eps=1e-6, **optimizer_params)
88+
elif lower_type == "lion8bit":
89+
return bitsandbytes.optim.Lion8bit(params, lr=learning_rate, **optimizer_params)
90+
else:
91+
raise ValueError(f'Unknown optimizer type {optimizer_type}')
10992
elif lower_type == 'adam':
11093
optimizer = torch.optim.Adam(params, lr=float(learning_rate), eps=1e-6, **optimizer_params)
11194
elif lower_type == 'adamw':

toolkit/stable_diffusion_model.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,6 @@
2828
from toolkit.ip_adapter import IPAdapter
2929
from toolkit.util.vae import load_vae
3030
from toolkit import train_tools
31-
from toolkit import device_utils
3231
from toolkit.config_modules import ModelConfig, GenerateImageConfig, ModelArch
3332
from toolkit.metadata import get_meta_for_safetensors
3433
from toolkit.models.decorator import Decorator
@@ -479,7 +478,7 @@ def load_model(self):
479478
te_kwargs = {}
480479
# handle quantization of TE
481480
te_is_quantized = False
482-
if device_utils.is_mps_available() and self.model_config.text_encoder_bits in [4, 8]:
481+
if self.device_torch.type == "mps" and self.model_config.text_encoder_bits in [4, 8]:
483482
print_acc("Warning: 4/8-bit quantization is not supported on MPS. Ignoring quantization.")
484483
else:
485484
if self.model_config.text_encoder_bits == 8:
@@ -568,7 +567,7 @@ def load_model(self):
568567
te_kwargs = {}
569568
# handle quantization of TE
570569
te_is_quantized = False
571-
if device_utils.is_mps_available() and self.model_config.text_encoder_bits in [4, 8]:
570+
if self.device_torch.type == "mps" and self.model_config.text_encoder_bits in [4, 8]:
572571
print_acc("Warning: 4/8-bit quantization is not supported on MPS. Ignoring quantization.")
573572
else:
574573
if self.model_config.text_encoder_bits == 8:
@@ -951,7 +950,7 @@ def load_model(self):
951950
te_kwargs = {}
952951
# handle quantization of TE
953952
te_is_quantized = False
954-
if device_utils.is_mps_available() and self.model_config.text_encoder_bits in [4, 8]:
953+
if self.device_torch.type == "mps" and self.model_config.text_encoder_bits in [4, 8]:
955954
print_acc("Warning: 4/8-bit quantization is not supported on MPS. Ignoring quantization.")
956955
else:
957956
if self.model_config.text_encoder_bits == 8:

toolkit/train_tools.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -658,8 +658,8 @@ class LearnableSNRGamma:
658658
This is a trainer for learnable snr gamma
659659
It will adapt to the dataset and attempt to adjust the snr multiplier to balance the loss over the timesteps
660660
"""
661-
def __init__(self, noise_scheduler: Union['DDPMScheduler'], device=None):
662-
self.device = device if device is not None else device_utils.get_device()
661+
def __init__(self, noise_scheduler: Union['DDPMScheduler'], device='cuda'):
662+
self.device = device
663663
self.noise_scheduler: Union['DDPMScheduler'] = noise_scheduler
664664
self.offset_1 = torch.nn.Parameter(torch.tensor(0.0, dtype=torch.float32, device=self.device))
665665
self.offset_2 = torch.nn.Parameter(torch.tensor(0.777, dtype=torch.float32, device=self.device))

toolkit/unloader.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
import gc
22
import torch
33
from toolkit.basic import flush
4-
from toolkit.device_utils import is_mps_available
54
from typing import TYPE_CHECKING
65

76

@@ -40,6 +39,8 @@ def unload_text_encoder(model: "BaseModel"):
4039
# we need to make it appear as a text encoder module without actually having one so all
4140
# to functions and what not will work.
4241

42+
is_mps = isinstance(model.device_torch, torch.device) and model.device_torch.type == "mps"
43+
4344
if model.text_encoder is not None:
4445
if isinstance(model.text_encoder, list):
4546
text_encoder_list = []
@@ -51,7 +52,7 @@ def unload_text_encoder(model: "BaseModel"):
5152
text_encoder_list.append(te)
5253
# if we are on mps, we don't want to move to cpu because it's unified memory
5354
# and just freeing the reference is enough and faster
54-
if not is_mps_available():
55+
if not is_mps:
5556
pipe.text_encoder.to('cpu')
5657
else:
5758
pipe.text_encoder.to('meta')
@@ -61,18 +62,18 @@ def unload_text_encoder(model: "BaseModel"):
6162
while hasattr(pipe, f"text_encoder_{i}"):
6263
te = FakeTextEncoder(device=model.device_torch, dtype=model.torch_dtype)
6364
text_encoder_list.append(te)
64-
if is_mps_available():
65+
if is_mps:
6566
getattr(pipe, f"text_encoder_{i}").to('meta')
6667
setattr(pipe, f"text_encoder_{i}", te)
6768
i += 1
6869
model.text_encoder = text_encoder_list
6970
else:
7071
# only has a single text encoder
71-
if is_mps_available():
72+
if is_mps:
7273
model.text_encoder.to('meta')
7374
model.text_encoder = FakeTextEncoder(device=model.device_torch, dtype=model.torch_dtype)
7475

75-
if torch.backends.mps.is_available():
76+
if is_mps:
7677
gc.collect()
7778
torch.mps.empty_cache()
7879

0 commit comments

Comments
 (0)