1 parent c67f582 commit 7acd7da
src/diffusers/models/transformers/transformer_flux2.py
@@ -321,7 +321,8 @@ def __init__(
         self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
 
         # Note that the MLP in/out linear layers are fused with the attention QKV/out projections, respectively; this
-        # is often called a "parallel" transformer block
+        # is often called a "parallel" transformer block. See the [ViT-22B paper](https://arxiv.org/abs/2302.05442)
+        # for a visual depiction of this type of transformer block.
         self.attn = Flux2Attention(
             query_dim=dim,
             dim_head=attention_head_dim,
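
The comment edited above describes a "parallel" transformer block, in which the attention and MLP branches share fused input and output projections rather than running as two sequential sub-blocks. A minimal, hypothetical sketch of that fusion pattern is shown below; the class name, argument names, and shapes are illustrative assumptions and do not mirror the actual Flux2Attention implementation in this file.

```python
# Hypothetical sketch of a ViT-22B-style "parallel" transformer block.
# The attention QKV projection is fused with the MLP input projection, and the
# attention output projection is fused with the MLP output projection, so each
# of the two matmuls serves both branches at once. Illustrative only.
import torch
import torch.nn as nn
import torch.nn.functional as F


class ParallelTransformerBlock(nn.Module):
    def __init__(self, dim: int, num_heads: int, mlp_ratio: float = 4.0, eps: float = 1e-6):
        super().__init__()
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.mlp_dim = int(dim * mlp_ratio)

        self.norm = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
        # Fused input projection: [Q | K | V | MLP-in] computed in a single matmul.
        self.in_proj = nn.Linear(dim, 3 * dim + self.mlp_dim)
        # Fused output projection: concatenated [attn-out | MLP-hidden] -> dim.
        self.out_proj = nn.Linear(dim + self.mlp_dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        b, n, d = x.shape
        h = self.norm(x)

        # One matmul produces Q, K, V and the MLP hidden activation.
        q, k, v, mlp_h = torch.split(self.in_proj(h), [d, d, d, self.mlp_dim], dim=-1)

        # Standard multi-head attention on the Q/K/V slices.
        q = q.view(b, n, self.num_heads, self.head_dim).transpose(1, 2)
        k = k.view(b, n, self.num_heads, self.head_dim).transpose(1, 2)
        v = v.view(b, n, self.num_heads, self.head_dim).transpose(1, 2)
        attn = F.scaled_dot_product_attention(q, k, v)
        attn = attn.transpose(1, 2).reshape(b, n, d)

        # The MLP branch runs in parallel with attention; both are folded back
        # into the residual stream by the second fused matmul.
        mlp_h = F.gelu(mlp_h)
        out = self.out_proj(torch.cat([attn, mlp_h], dim=-1))

        # A single residual connection carries the combined attention + MLP update.
        return x + out
```

Compared with a sequential pre-norm block, this layout needs only one LayerNorm, one residual add, and two large matmuls per block, which is the motivation cited in the ViT-22B paper linked in the comment.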