
Commit d77f5f5

Fix Qwen-Image long prompt dimension mismatch error (issue #12083)
- Add dynamic expansion capability to the QwenEmbedRope pos_freqs buffer
- Expand the buffer when max_vid_index + max_len exceeds the current size
- Prevent RuntimeError when text prompts exceed 1024 tokens with large images
- Add a test case for long prompt scenarios
- Maintain backward compatibility with existing functionality

Fixes: #12083
1 parent f19421e commit d77f5f5

File tree: 2 files changed (+89, -6 lines)

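Background on the failure mode: PyTorch slices past a tensor's end truncate silently rather than raising, so with the fixed 1024-position buffer a long prompt plus a large image offset yields a txt_freqs slice shorter than the text sequence, and the size mismatch only surfaces downstream when the RoPE frequencies are applied. A minimal sketch with hypothetical sizes:

    import torch

    # Hypothetical sizes: a 1024-row frequency buffer, a video index offset of 64,
    # and an 1100-token prompt, mirroring the shapes involved in issue #12083.
    pos_freqs = torch.randn(1024, 64)
    max_vid_index, max_len = 64, 1100

    # Out-of-range slicing truncates silently instead of raising.
    txt_freqs = pos_freqs[max_vid_index : max_vid_index + max_len]
    print(txt_freqs.shape[0])  # 960 rows, not the 1100 the caller expects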

src/diffusers/models/transformers/transformer_qwenimage.py

Lines changed: 52 additions & 6 deletions
@@ -160,24 +160,26 @@ def __init__(self, theta: int, axes_dim: List[int], scale_rope=False):
         super().__init__()
         self.theta = theta
         self.axes_dim = axes_dim
-        pos_index = torch.arange(1024)
-        neg_index = torch.arange(1024).flip(0) * -1 - 1
-        self.pos_freqs = torch.cat(
+        # Initialize with the default size of 1024, but allow dynamic expansion
+        self._current_max_len = 1024
+        pos_index = torch.arange(self._current_max_len)
+        neg_index = torch.arange(self._current_max_len).flip(0) * -1 - 1
+        self.register_buffer('pos_freqs', torch.cat(
             [
                 self.rope_params(pos_index, self.axes_dim[0], self.theta),
                 self.rope_params(pos_index, self.axes_dim[1], self.theta),
                 self.rope_params(pos_index, self.axes_dim[2], self.theta),
             ],
             dim=1,
-        )
-        self.neg_freqs = torch.cat(
+        ))
+        self.register_buffer('neg_freqs', torch.cat(
             [
                 self.rope_params(neg_index, self.axes_dim[0], self.theta),
                 self.rope_params(neg_index, self.axes_dim[1], self.theta),
                 self.rope_params(neg_index, self.axes_dim[2], self.theta),
             ],
             dim=1,
-        )
+        ))
         self.rope_cache = {}

         # Whether to use scale RoPE
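
Switching from plain tensor attributes to register_buffer means pos_freqs and neg_freqs now follow the module through .to(device) and .to(dtype) calls; note that persistent buffers also appear in state_dict by default (register_buffer accepts persistent=False to opt out). A minimal illustration of the difference, independent of this model:

    import torch
    import torch.nn as nn

    class WithBuffer(nn.Module):
        def __init__(self):
            super().__init__()
            self.plain = torch.zeros(4)                      # plain attribute: ignored by Module.to()
            self.register_buffer("tracked", torch.zeros(4))  # buffer: converted/moved with the module

    m = WithBuffer().to(torch.float64)
    print(m.plain.dtype)    # torch.float32 -- unchanged
    print(m.tracked.dtype)  # torch.float64 -- follows the module
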
@@ -193,6 +195,45 @@ def rope_params(self, index, dim, theta=10000):
         freqs = torch.polar(torch.ones_like(freqs), freqs)
         return freqs

+    def _expand_pos_freqs_if_needed(self, required_len):
+        """Expand pos_freqs and neg_freqs if the required length exceeds the current size."""
+        if required_len <= self._current_max_len:
+            return
+
+        # Calculate the new size (round up to the nearest multiple of 512 for efficiency)
+        new_max_len = max(required_len, int((required_len + 511) // 512) * 512)
+
+        # Generate expanded indices
+        pos_index = torch.arange(new_max_len, device=self.pos_freqs.device)
+        neg_index = torch.arange(new_max_len, device=self.neg_freqs.device).flip(0) * -1 - 1
+
+        # Generate expanded frequency embeddings
+        new_pos_freqs = torch.cat(
+            [
+                self.rope_params(pos_index, self.axes_dim[0], self.theta),
+                self.rope_params(pos_index, self.axes_dim[1], self.theta),
+                self.rope_params(pos_index, self.axes_dim[2], self.theta),
+            ],
+            dim=1,
+        ).to(device=self.pos_freqs.device, dtype=self.pos_freqs.dtype)
+
+        new_neg_freqs = torch.cat(
+            [
+                self.rope_params(neg_index, self.axes_dim[0], self.theta),
+                self.rope_params(neg_index, self.axes_dim[1], self.theta),
+                self.rope_params(neg_index, self.axes_dim[2], self.theta),
+            ],
+            dim=1,
+        ).to(device=self.neg_freqs.device, dtype=self.neg_freqs.dtype)
+
+        # Update the registered buffers
+        self.register_buffer('pos_freqs', new_pos_freqs)
+        self.register_buffer('neg_freqs', new_neg_freqs)
+        self._current_max_len = new_max_len
+
+        # Clear the RoPE cache since the cached embeddings are now stale
+        self.rope_cache = {}
+
     def forward(self, video_fhw, txt_seq_lens, device):
         """
         Args: video_fhw: [frame, height, width] a list of 3 integers representing the shape of the video
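
The growth policy above rounds the required length up to the next multiple of 512, so a sequence slightly over the current capacity doesn't trigger a fresh recomputation on every call. A quick check of the arithmetic:

    # Same rounding expression as in _expand_pos_freqs_if_needed
    for required_len in (1025, 1536, 1537, 3000):
        new_max_len = max(required_len, (required_len + 511) // 512 * 512)
        print(required_len, "->", new_max_len)
    # 1025 -> 1536, 1536 -> 1536, 1537 -> 2048, 3000 -> 3072

The outer max() is a no-op for positive lengths, since the rounded value is always at least required_len.
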
@@ -232,6 +273,11 @@ def forward(self, video_fhw, txt_seq_lens, device):
             max_vid_index = max(height, width)

         max_len = max(txt_seq_lens)
+
+        # Expand pos_freqs if needed to accommodate max_vid_index + max_len
+        required_len = max_vid_index + max_len
+        self._expand_pos_freqs_if_needed(required_len)
+
         txt_freqs = self.pos_freqs[max_vid_index : max_vid_index + max_len, ...]

         return vid_freqs, txt_freqs
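
Taken together, the hook in forward guarantees the pos_freqs slice always has max_len rows. A small isolation check of the new expansion path (a sketch; the axes_dim values here are placeholders, not necessarily the real Qwen-Image configuration):

    from diffusers.models.transformers.transformer_qwenimage import QwenEmbedRope

    rope = QwenEmbedRope(theta=10000, axes_dim=[16, 56, 56], scale_rope=True)
    assert rope.pos_freqs.shape[0] == 1024                  # default buffer size

    rope._expand_pos_freqs_if_needed(64 + 1100)             # video offset + text length
    assert rope.pos_freqs.shape[0] == 1536                  # rounded up to a multiple of 512
    assert rope.pos_freqs[64 : 64 + 1100].shape[0] == 1100  # slice is now full-length
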

tests/pipelines/qwenimage/test_qwenimage.py

Lines changed: 37 additions & 0 deletions
@@ -234,3 +234,40 @@ def test_vae_tiling(self, expected_diff_max: float = 0.2):
             expected_diff_max,
             "VAE tiling should not affect the inference results",
         )
+
+    def test_long_prompt_no_error(self):
+        # Test for issue #12083: long prompts should not cause dimension mismatch errors
+        device = torch_device
+        components = self.get_dummy_components()
+        pipe = self.pipeline_class(**components)
+        pipe.to(device)
+
+        # Create a prompt long enough to exceed 1024 tokens once the image position
+        # offset is added; repeat a phrase to simulate a real long-prompt scenario
+        long_phrase = "A beautiful, detailed, high-resolution, photorealistic image showing "
+        long_prompt = (long_phrase * 50)[:1200]  # ensure we exceed 1024 characters
+
+        inputs = {
+            "prompt": long_prompt,
+            "generator": torch.Generator(device=device).manual_seed(0),
+            "num_inference_steps": 2,
+            "guidance_scale": 3.0,
+            "true_cfg_scale": 1.0,
+            "height": 32,  # small size for a fast test
+            "width": 32,  # small size for a fast test
+            "max_sequence_length": 1200,  # allow a long sequence
+            "output_type": "pt",
+        }
+
+        # This should not raise a RuntimeError about a tensor dimension mismatch
+        try:
+            output = pipe(**inputs)
+            # Basic sanity check that we got reasonable output
+            self.assertIsNotNone(output)
+            self.assertIsNotNone(output[0])
+        except RuntimeError as e:
+            if "must match the size of tensor" in str(e):
+                self.fail(f"Long prompt caused dimension mismatch error: {e}")
+            else:
+                # Re-raise runtime errors that aren't related to this fix
+                raise
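
To run just the new test locally, something like `pytest tests/pipelines/qwenimage/test_qwenimage.py -k test_long_prompt_no_error` should exercise the expansion path end to end (assuming a standard diffusers development install).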
