@@ -66,12 +66,16 @@ def _prepare_regional_attn_mask(
 ) -> torch.Tensor | None:
     """Prepare a regional attention mask for Z-Image.
 
-    The mask controls which tokens can attend to each other:
-    - Image tokens within a region attend only to each other
+    This uses an 'unrestricted' image self-attention approach (similar to FLUX):
+    - Image tokens can attend to ALL other image tokens (unrestricted self-attention)
     - Image tokens attend only to their corresponding regional text
     - Text tokens attend only to their corresponding regional image
     - Text tokens attend to themselves
 
+    The unrestricted image self-attention allows the model to maintain global
+    coherence across regions, preventing the generation of separate/disconnected
+    images for each region.
+
     Z-Image sequence order: [img_tokens, txt_tokens]
 
     Args:
@@ -129,12 +133,6 @@ def _prepare_regional_attn_mask(
             # 3. txt attends to corresponding regional img
             # Reshape mask to (1, img_seq_len) for broadcasting
             regional_attention_mask[txt_start:txt_end, :img_seq_len] = mask_flat.view(1, img_seq_len)
-
-            # 4. img self-attention within region
-            # mask @ mask.T creates pairwise attention within the masked region
-            regional_attention_mask[:img_seq_len, :img_seq_len] += mask_flat.view(img_seq_len, 1) @ mask_flat.view(
-                1, img_seq_len
-            )
         else:
             # Global prompt: allow attention to/from background regions only
             if background_region_mask is not None:
@@ -152,10 +150,10 @@ def _prepare_regional_attn_mask(
                 regional_attention_mask[:img_seq_len, txt_start:txt_end] = 1.0
                 regional_attention_mask[txt_start:txt_end, :img_seq_len] = 1.0
 
-    # Allow background regions to attend to themselves
-    if background_region_mask is not None:
-        bg_mask = background_region_mask.view(img_seq_len, 1)
-        regional_attention_mask[:img_seq_len, :img_seq_len] += bg_mask @ bg_mask.T
+    # 4. Allow unrestricted image self-attention
+    # This is the key difference from the restricted approach - all image tokens
+    # can attend to each other, which helps maintain global coherence across regions
+    regional_attention_mask[:img_seq_len, :img_seq_len] = 1.0
 
     # Convert to boolean mask
     regional_attention_mask = regional_attention_mask > 0.5
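
For illustration, here is a minimal standalone sketch of the mask layout this diff describes, assuming a sequence ordered as [img_tokens, txt_tokens] with one binary image mask and one text span per regional prompt. The names (`build_regional_attn_mask_sketch`, `region_img_masks`, `region_txt_lens`) are hypothetical, and the global/background-prompt handling of the actual function is omitted:

```python
import torch


def build_regional_attn_mask_sketch(
    img_seq_len: int,
    region_img_masks: list[torch.Tensor],  # each: (img_seq_len,) binary mask marking a region's image tokens
    region_txt_lens: list[int],            # number of text tokens per regional prompt, same order as the masks
) -> torch.Tensor:
    # Sequence layout: [img_tokens, txt_tokens]; a True entry means "may attend".
    txt_seq_len = sum(region_txt_lens)
    seq_len = img_seq_len + txt_seq_len
    attn = torch.zeros(seq_len, seq_len)

    txt_start = img_seq_len
    for mask_flat, txt_len in zip(region_img_masks, region_txt_lens):
        mask_flat = mask_flat.float()
        txt_end = txt_start + txt_len

        # Text tokens of a region attend to themselves
        attn[txt_start:txt_end, txt_start:txt_end] = 1.0

        # Image tokens of the region attend to the region's text, and vice versa
        attn[:img_seq_len, txt_start:txt_end] = mask_flat.view(img_seq_len, 1)
        attn[txt_start:txt_end, :img_seq_len] = mask_flat.view(1, img_seq_len)

        txt_start = txt_end

    # Unrestricted image self-attention: every image token sees every other image token
    attn[:img_seq_len, :img_seq_len] = 1.0

    return attn > 0.5
```

A boolean mask in this form (True = attend) can be passed, for example, as the `attn_mask` argument of `torch.nn.functional.scaled_dot_product_attention`. The key point of the change is that the image-image block becomes all True, rather than a block pattern built from per-region (or background) outer products.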