 try:
     from flash_attn import flash_attn_varlen_func
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
-except:
+except ImportError:
     # flash_attn may not be available but it is not required
     pass

 try:
     from sageattention import sageattn
-except:
+except ImportError:
     pass

 try:
     from apex.normalization import FusedRMSNorm as RMSNorm
-except:
+except ImportError:
     import warnings

     warnings.warn("Cannot import apex RMSNorm, switch to vanilla implementation")
@@ -98,7 +98,7 @@ def forward(self, x: Tensor):
         x_dtype = x.dtype
         # To handle float8 we need to convert the tensor to float
         x = x.float()
-        rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
+        rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + self.eps)
         return ((x * rrms) * self.weight.float()).to(dtype=x_dtype)


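For context, a minimal sketch of a vanilla RMSNorm module whose `forward()` matches the hunk above; the constructor, field names, and the `1e-5` default are assumptions rather than taken from the diff. It only shows where `self.eps` comes from, which is what the fix switches the float8 path to use instead of the hard-coded `1e-6`.

```python
import torch
from torch import nn, Tensor


class VanillaRMSNorm(nn.Module):
    """Sketch only: a vanilla RMSNorm with a forward() like the hunk above."""

    def __init__(self, dim: int, eps: float = 1e-5):  # eps default is an assumption
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: Tensor):
        x_dtype = x.dtype
        x = x.float()  # upcast so float8/half inputs reduce in float32
        rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + self.eps)
        return ((x * rrms) * self.weight.float()).to(dtype=x_dtype)
```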
@@ -370,7 +370,7 @@ def forward(
         if self.use_sage_attn:
             # Handle GQA (Grouped Query Attention) if needed
             n_rep = self.n_local_heads // self.n_local_kv_heads
-            if n_rep >= 1:
+            if n_rep > 1:
                 xk = xk.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
                 xv = xv.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)

@@ -379,7 +379,7 @@ def forward(
             output = self.flash_attn(xq, xk, xv, x_mask, softmax_scale)
         else:
             n_rep = self.n_local_heads // self.n_local_kv_heads
-            if n_rep >= 1:
+            if n_rep > 1:
                 xk = xk.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)
                 xv = xv.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)

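The `unsqueeze/repeat/flatten` pattern in both branches expands the KV heads for grouped-query attention; changing the guard to `n_rep > 1` skips the expansion when query and KV head counts are equal, where it would be a semantic no-op that still copies memory. A standalone sketch with toy shapes (not part of the diff) showing the pattern is equivalent to `repeat_interleave` along the head dimension:

```python
import torch

# Assumed toy shapes: xk is [bsz, seqlen, n_kv_heads, head_dim], n_rep = n_heads // n_kv_heads.
bsz, seqlen, n_kv_heads, head_dim, n_rep = 2, 8, 4, 16, 2
xk = torch.randn(bsz, seqlen, n_kv_heads, head_dim)

# Pattern used in the diff: insert a repeat axis, tile it, then merge it into the head axis.
expanded = xk.unsqueeze(3).repeat(1, 1, 1, n_rep, 1).flatten(2, 3)

# Equivalent to repeating each KV head n_rep times along the head dimension.
assert torch.equal(expanded, xk.repeat_interleave(n_rep, dim=2))
assert expanded.shape == (bsz, seqlen, n_kv_heads * n_rep, head_dim)
```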
@@ -456,51 +456,47 @@ def sage_attn(self, q: Tensor, k: Tensor, v: Tensor, x_mask: Tensor, softmax_sca
             bsz = q.shape[0]
             seqlen = q.shape[1]

-            # Transpose tensors to match SageAttention's expected format (HND layout)
-            q_transposed = q.permute(0, 2, 1, 3)  # [batch, heads, seq_len, head_dim]
-            k_transposed = k.permute(0, 2, 1, 3)  # [batch, heads, seq_len, head_dim]
-            v_transposed = v.permute(0, 2, 1, 3)  # [batch, heads, seq_len, head_dim]
-
-            # Handle masking for SageAttention
-            # We need to filter out masked positions - this approach handles variable sequence lengths
-            outputs = []
-            for b in range(bsz):
-                # Find valid token positions from the mask
-                valid_indices = torch.nonzero(x_mask[b], as_tuple=False).squeeze(-1)
-                if valid_indices.numel() == 0:
-                    # If all tokens are masked, create a zero output
-                    batch_output = torch.zeros(
-                        seqlen, self.n_local_heads, self.head_dim,
-                        device=q.device, dtype=q.dtype
-                    )
-                else:
-                    # Extract only valid tokens for this batch
-                    batch_q = q_transposed[b, :, valid_indices, :]
-                    batch_k = k_transposed[b, :, valid_indices, :]
-                    batch_v = v_transposed[b, :, valid_indices, :]
-
-                    # Run SageAttention on valid tokens only
+            # Transpose to SageAttention's expected HND layout: [batch, heads, seq_len, head_dim]
+            q_transposed = q.permute(0, 2, 1, 3)
+            k_transposed = k.permute(0, 2, 1, 3)
+            v_transposed = v.permute(0, 2, 1, 3)
+
+            # Fast path: if all tokens are valid, run batched SageAttention directly
+            if x_mask.all():
+                output = sageattn(
+                    q_transposed, k_transposed, v_transposed,
+                    tensor_layout="HND", is_causal=False, sm_scale=softmax_scale,
+                )
+                # output: [batch, heads, seq_len, head_dim] -> [batch, seq_len, heads, head_dim]
+                output = output.permute(0, 2, 1, 3)
+            else:
+                # Slow path: per-batch loop to handle variable-length masking
+                # SageAttention does not support attention masks natively
+                outputs = []
+                for b in range(bsz):
+                    valid_indices = x_mask[b].nonzero(as_tuple=True)[0]
+                    if valid_indices.numel() == 0:
+                        outputs.append(torch.zeros(
+                            seqlen, self.n_local_heads, self.head_dim,
+                            device=q.device, dtype=q.dtype,
+                        ))
+                        continue
+
                     batch_output_valid = sageattn(
-                        batch_q.unsqueeze(0),  # Add batch dimension back
-                        batch_k.unsqueeze(0),
-                        batch_v.unsqueeze(0),
-                        tensor_layout="HND",
-                        is_causal=False,
-                        sm_scale=softmax_scale
+                        q_transposed[b : b + 1, :, valid_indices, :],
+                        k_transposed[b : b + 1, :, valid_indices, :],
+                        v_transposed[b : b + 1, :, valid_indices, :],
+                        tensor_layout="HND", is_causal=False, sm_scale=softmax_scale,
                     )
-
-                    # Create output tensor with zeros for masked positions
+
                     batch_output = torch.zeros(
-                        seqlen, self.n_local_heads, self.head_dim,
-                        device=q.device, dtype=q.dtype
+                        seqlen, self.n_local_heads, self.head_dim,
+                        device=q.device, dtype=q.dtype,
                     )
-                    # Place valid outputs back in the right positions
                     batch_output[valid_indices] = batch_output_valid.squeeze(0).permute(1, 0, 2)
-
-                outputs.append(batch_output)
-
-            # Stack batch outputs and reshape to expected format
-            output = torch.stack(outputs, dim=0)  # [batch, seq_len, heads, head_dim]
+                    outputs.append(batch_output)
+
+                output = torch.stack(outputs, dim=0)
         except NameError as e:
             raise RuntimeError(
                 f"Could not load Sage Attention. Please install https://github.com/thu-ml/SageAttention. / Sage Attention を読み込めませんでした。https://github.com/thu-ml/SageAttention をインストールしてください。 / {e}"
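A standalone sketch (toy sizes, not part of the diff) of the slow path's gather/attend/scatter: valid tokens are selected per sample, attention runs over them only, and the results are written back into a zero-padded `[seq_len, heads, head_dim]` tensor, as in the `batch_output[valid_indices] = ...` line above. `sageattn` is stubbed with torch's `scaled_dot_product_attention`, so only the shapes and indexing are being illustrated, not the SageAttention kernel itself.

```python
import torch
import torch.nn.functional as F

bsz, seqlen, heads, head_dim = 1, 6, 2, 8
q = torch.randn(bsz, seqlen, heads, head_dim)
k, v = torch.randn_like(q), torch.randn_like(q)
x_mask = torch.tensor([[True, True, True, True, False, False]])

# HND layout: [batch, heads, seq_len, head_dim]
q_t, k_t, v_t = (t.permute(0, 2, 1, 3) for t in (q, k, v))

b = 0
valid = x_mask[b].nonzero(as_tuple=True)[0]  # positions of unpadded tokens
out_valid = F.scaled_dot_product_attention(  # [1, heads, n_valid, head_dim]
    q_t[b : b + 1, :, valid, :],
    k_t[b : b + 1, :, valid, :],
    v_t[b : b + 1, :, valid, :],
)

# Scatter results back into a zero-padded [seq_len, heads, head_dim] tensor.
out = torch.zeros(seqlen, heads, head_dim)
out[valid] = out_valid.squeeze(0).permute(1, 0, 2)
print(out.shape)  # torch.Size([6, 2, 8])
```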
@@ -1113,10 +1109,9 @@ def patchify_and_embed(

         x = x.view(bsz, channels, height // pH, pH, width // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2)

-        x_mask = torch.zeros(bsz, image_seq_len, dtype=torch.bool, device=device)
-        for i in range(bsz):
-            x[i, :image_seq_len] = x[i]
-            x_mask[i, :image_seq_len] = True
+        # x.shape[1] == image_seq_len after patchify, so this was assigning to itself.
+        # The mask can be set without a loop since all samples have the same image_seq_len.
+        x_mask = torch.ones(bsz, image_seq_len, dtype=torch.bool, device=device)

         x = self.x_embedder(x)

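A quick standalone check (toy sizes, assumed values, not part of the diff) of the claim in the new comment: after the view/permute/flatten patchify, `x.shape[1]` already equals `image_seq_len`, so the removed loop copied each sample onto itself and the mask can simply be all `True`.

```python
import torch

bsz, channels, height, width, pH, pW = 2, 4, 8, 8, 2, 2
x = torch.randn(bsz, channels, height, width)

# Same patchify reshape as in the hunk above.
x = x.view(bsz, channels, height // pH, pH, width // pW, pW).permute(0, 2, 4, 3, 5, 1).flatten(3).flatten(1, 2)

image_seq_len = (height // pH) * (width // pW)
assert x.shape == (bsz, image_seq_len, pH * pW * channels)

# All samples share the same image_seq_len, so the mask is all True.
x_mask = torch.ones(bsz, image_seq_len, dtype=torch.bool)
```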
@@ -1389,4 +1384,4 @@ def NextDiT_7B_GQA_patch2_Adaln_Refiner(**kwargs):
         axes_dims=[40, 40, 40],
         axes_lens=[300, 512, 512],
         **kwargs,
-    )
+    )