Skip to content

Commit 7290eb6

Browse files
ciaranbor and Cursor Assistant authored
Align Qwen-Image with diffusers reference (#362)
Co-authored-by: Cursor Assistant <assistant@cursor.com>
1 parent bcdf4af commit 7290eb6

File tree

11 files changed

+60
-52
lines changed

11 files changed

+60
-52
lines changed

src/mflux/models/common/config/model_config.py

Lines changed: 18 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,6 +23,11 @@ def __init__(
2323
requires_sigma_shift: bool | None,
2424
transformer_overrides: dict | None = None,
2525
text_encoder_overrides: dict | None = None,
26+
sigma_base_shift: float = 0.5,
27+
sigma_max_shift: float = 1.15,
28+
sigma_base_seq_len: int = 256,
29+
sigma_max_seq_len: int = 4096,
30+
sigma_shift_terminal: float | None = None,
2631
):
2732
self.aliases = aliases
2833
self.model_name = model_name
@@ -36,6 +41,11 @@ def __init__(
3641
self.priority = priority
3742
self.transformer_overrides = transformer_overrides or {}
3843
self.text_encoder_overrides = text_encoder_overrides or {}
44+
self.sigma_base_shift = sigma_base_shift
45+
self.sigma_max_shift = sigma_max_shift
46+
self.sigma_base_seq_len = sigma_base_seq_len
47+
self.sigma_max_seq_len = sigma_max_seq_len
48+
self.sigma_shift_terminal = sigma_shift_terminal
3949

4050
@staticmethod
4151
@lru_cache
@@ -411,7 +421,10 @@ def from_name(
411421
num_train_steps=None,
412422
max_sequence_length=None,
413423
supports_guidance=None,
414-
requires_sigma_shift=None,
424+
requires_sigma_shift=True,
425+
sigma_max_shift=0.9,
426+
sigma_max_seq_len=8192,
427+
sigma_shift_terminal=0.02,
415428
),
416429
"qwen-image-edit": ModelConfig(
417430
priority=16,
@@ -423,7 +436,10 @@ def from_name(
423436
num_train_steps=None,
424437
max_sequence_length=None,
425438
supports_guidance=None,
426-
requires_sigma_shift=None,
439+
requires_sigma_shift=True,
440+
sigma_max_shift=0.9,
441+
sigma_max_seq_len=8192,
442+
sigma_shift_terminal=0.02,
427443
),
428444
"fibo": ModelConfig(
429445
priority=17,

src/mflux/models/common/schedulers/linear_scheduler.py

Lines changed: 15 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -24,23 +24,26 @@ def timesteps(self) -> mx.array:
2424

2525
def _get_sigmas(self) -> mx.array:
2626
model_config = self.config.model_config
27-
sigmas = mx.linspace(
28-
1.0,
29-
1.0 / self.config.num_inference_steps,
30-
self.config.num_inference_steps,
31-
)
27+
num_steps = self.config.num_inference_steps
28+
sigmas = mx.linspace(1.0, 1.0 / num_steps, num_steps)
3229
sigmas = mx.array(sigmas).astype(mx.float32)
3330
sigmas = mx.concatenate([sigmas, mx.zeros(1)])
3431
if model_config.requires_sigma_shift:
35-
y1 = 0.5
36-
x1 = 256
37-
m = (1.15 - y1) / (4096 - x1)
38-
b = y1 - m * x1
32+
m = (model_config.sigma_max_shift - model_config.sigma_base_shift) / (
33+
model_config.sigma_max_seq_len - model_config.sigma_base_seq_len
34+
)
35+
b = model_config.sigma_base_shift - m * model_config.sigma_base_seq_len
3936
mu = m * self.config.width * self.config.height / 256 + b
4037
mu = mx.array(mu)
41-
shifted_sigmas = mx.exp(mu) / (mx.exp(mu) + (1 / sigmas - 1))
42-
shifted_sigmas[-1] = 0
43-
return shifted_sigmas
38+
39+
shifted = mx.exp(mu) / (mx.exp(mu) + (1 / sigmas[:-1] - 1))
40+
41+
if model_config.sigma_shift_terminal is not None:
42+
one_minus = 1.0 - shifted
43+
scale = one_minus[-1] / (1.0 - model_config.sigma_shift_terminal)
44+
shifted = 1.0 - (one_minus / scale)
45+
46+
return mx.concatenate([shifted, mx.zeros(1)])
4447
else:
4548
return sigmas
4649

src/mflux/models/qwen/model/qwen_text_encoder/qwen_attention.py

Lines changed: 6 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -61,15 +61,12 @@ def __call__(
6161
key_states = QwenAttention._repeat_kv(key_states, self.num_key_value_groups)
6262
value_states = QwenAttention._repeat_kv(value_states, self.num_key_value_groups)
6363

64-
attn_weights = mx.matmul(query_states, key_states.transpose(0, 1, 3, 2)) * self.scaling
65-
66-
if attention_mask is not None:
67-
causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
68-
attn_weights = attn_weights + causal_mask
69-
70-
# Softmax and output
71-
attn_weights = mx.softmax(attn_weights.astype(mx.float32), axis=-1).astype(query_states.dtype)
72-
attn_output = mx.matmul(attn_weights, value_states)
64+
mask = attention_mask[:, :, :, : key_states.shape[-2]].astype(query_states.dtype) if attention_mask is not None else None
65+
attn_output = mx.fast.scaled_dot_product_attention(
66+
query_states, key_states, value_states,
67+
scale=self.scaling,
68+
mask=mask,
69+
)
7370
attn_output = attn_output.transpose(0, 2, 1, 3).reshape(bsz, q_len, self.hidden_size)
7471
attn_output = self.o_proj(attn_output)
7572
return attn_output

src/mflux/models/qwen/model/qwen_text_encoder/qwen_vision_attention.py

Lines changed: 18 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -18,10 +18,12 @@ def _rotate_half(self, x: mx.array) -> mx.array:
1818
return mx.concatenate([-x2, x1], axis=-1)
1919

2020
def _apply_rope(self, x: mx.array, cos: mx.array, sin: mx.array) -> mx.array:
21-
cos_expanded = mx.expand_dims(cos, axis=0)
22-
sin_expanded = mx.expand_dims(sin, axis=0)
21+
orig_dtype = x.dtype
22+
x = x.astype(mx.float32)
23+
cos_expanded = mx.expand_dims(cos, axis=0).astype(mx.float32)
24+
sin_expanded = mx.expand_dims(sin, axis=0).astype(mx.float32)
2325
rotated = (x * cos_expanded) + (self._rotate_half(x) * sin_expanded)
24-
return rotated
26+
return rotated.astype(orig_dtype)
2527

2628
def __call__(self, x: mx.array, position_embeddings=None, cu_seqlens=None) -> mx.array:
2729
seq_len, embed_dim = x.shape
@@ -42,41 +44,30 @@ def __call__(self, x: mx.array, position_embeddings=None, cu_seqlens=None) -> mx
4244
q = self._apply_rope(q, cos_emb, sin_emb)
4345
k = self._apply_rope(k, cos_emb, sin_emb)
4446

47+
scale = 1.0 / (self.head_dim**0.5)
48+
4549
# Process attention chunks if cu_seqlens is provided (windowed attention)
4650
if cu_seqlens is not None and len(cu_seqlens) > 2:
47-
# Split Q, K, V into chunks based on cu_seqlens
48-
# cu_seqlens is cumulative, so lengths[i] = cu_seqlens[i+1] - cu_seqlens[i]
4951
lengths = [int((cu_seqlens[i + 1] - cu_seqlens[i]).item()) for i in range(len(cu_seqlens) - 1)]
5052

51-
# Split tensors (q,k,v are [heads, seq, head_dim])
52-
q_chunks = []
53-
k_chunks = []
54-
v_chunks = []
53+
attn_outputs = []
5554
offset = 0
5655
for length in lengths:
57-
q_chunks.append(q[:, offset : offset + length, :])
58-
k_chunks.append(k[:, offset : offset + length, :])
59-
v_chunks.append(v[:, offset : offset + length, :])
56+
q_chunk = mx.expand_dims(q[:, offset : offset + length, :], axis=0)
57+
k_chunk = mx.expand_dims(k[:, offset : offset + length, :], axis=0)
58+
v_chunk = mx.expand_dims(v[:, offset : offset + length, :], axis=0)
6059
offset += length
60+
out = mx.fast.scaled_dot_product_attention(q_chunk, k_chunk, v_chunk, scale=scale)
61+
attn_outputs.append(out.squeeze(0))
6162

62-
# Process each chunk separately
63-
attn_outputs = []
64-
scale = 1.0 / (self.head_dim**0.5)
65-
for q_chunk, k_chunk, v_chunk in zip(q_chunks, k_chunks, v_chunks):
66-
# Compute attention for this chunk
67-
scores = mx.matmul(q_chunk, k_chunk.transpose(0, 2, 1)) * scale
68-
attn_weights = mx.softmax(scores, axis=-1)
69-
attn_chunk = mx.matmul(attn_weights, v_chunk) # [heads, chunk_len, head_dim]
70-
attn_outputs.append(attn_chunk)
71-
72-
# Concatenate chunks back together
7363
attn_output = mx.concatenate(attn_outputs, axis=1) # [heads, seq, head_dim]
7464
else:
7565
# Full attention (no chunking)
76-
scale = 1.0 / (self.head_dim**0.5)
77-
scores = mx.matmul(q, k.transpose(0, 2, 1)) * scale # [heads, seq, seq]
78-
attn_weights = mx.softmax(scores, axis=-1)
79-
attn_output = mx.matmul(attn_weights, v) # [heads, seq, head_dim]
66+
q_4d = mx.expand_dims(q, axis=0)
67+
k_4d = mx.expand_dims(k, axis=0)
68+
v_4d = mx.expand_dims(v, axis=0)
69+
attn_output = mx.fast.scaled_dot_product_attention(q_4d, k_4d, v_4d, scale=scale)
70+
attn_output = attn_output.squeeze(0) # [heads, seq, head_dim]
8071

8172
# Reshape and project
8273
attn_output = attn_output.transpose(1, 0, 2).reshape(seq_len, embed_dim) # [seq, embed_dim]

src/mflux/models/qwen/qwen_initializer.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -58,7 +58,7 @@ def init_edit(
5858
model.tokenizers["qwen_vl"] = QwenVisionLanguageTokenizer(
5959
processor=processor,
6060
max_length=1024,
61-
use_picture_prefix=True,
61+
use_picture_prefix=False,
6262
)
6363
model.qwen_vl_encoder = QwenVisionLanguageEncoder(encoder=model.text_encoder.encoder)
6464

src/mflux/models/qwen/weights/qwen_weight_definition.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,8 @@ def get_tokenizers() -> List[TokenizerDefinition]:
4141
hf_subdir="tokenizer",
4242
tokenizer_class="Qwen2Tokenizer",
4343
encoder_class=LanguageTokenizer,
44-
max_length=1024,
44+
max_length=1058,
45+
padding="longest",
4546
template="<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
4647
download_patterns=["tokenizer/**", "added_tokens.json", "chat_template.jinja"],
4748
),
-4.79 KB
Loading
-24.4 KB
Loading
4.51 KB
Loading
28.1 KB
Loading

0 commit comments

Comments (0)