@@ -13,6 +13,7 @@
 # limitations under the License.
 
 
+import functools
 import math
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -162,15 +163,15 @@ def __init__(self, theta: int, axes_dim: List[int], scale_rope=False):
         self.axes_dim = axes_dim
         pos_index = torch.arange(1024)
         neg_index = torch.arange(1024).flip(0) * -1 - 1
-        self.pos_freqs = torch.cat(
+        pos_freqs = torch.cat(
             [
                 self.rope_params(pos_index, self.axes_dim[0], self.theta),
                 self.rope_params(pos_index, self.axes_dim[1], self.theta),
                 self.rope_params(pos_index, self.axes_dim[2], self.theta),
             ],
             dim=1,
         )
-        self.neg_freqs = torch.cat(
+        neg_freqs = torch.cat(
             [
                 self.rope_params(neg_index, self.axes_dim[0], self.theta),
                 self.rope_params(neg_index, self.axes_dim[1], self.theta),
@@ -179,6 +180,8 @@ def __init__(self, theta: int, axes_dim: List[int], scale_rope=False):
             dim=1,
         )
         self.rope_cache = {}
+        self.register_buffer("pos_freqs", pos_freqs, persistent=False)
+        self.register_buffer("neg_freqs", neg_freqs, persistent=False)
 
         # Whether to use scale RoPE
         self.scale_rope = scale_rope
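
Note on the buffer change above: registering `pos_freqs`/`neg_freqs` with `register_buffer(..., persistent=False)` makes them follow `model.to(device)` and dtype casts automatically (which is why the manual device check in `forward` is dropped below) while keeping them out of the `state_dict`, so existing checkpoints are unaffected. A minimal sketch of that behaviour, using a toy module invented for illustration:

```python
import torch
import torch.nn as nn


class ToyRope(nn.Module):  # hypothetical module, for illustration only
    def __init__(self):
        super().__init__()
        freqs = torch.randn(1024, 64)
        # persistent=False: the buffer moves/casts with the module but is not serialized
        self.register_buffer("freqs", freqs, persistent=False)


m = ToyRope()
print("freqs" in m.state_dict())  # False -> checkpoints keep their old layout
m = m.to(torch.bfloat16)          # buffer follows dtype casts ...
if torch.cuda.is_available():
    m = m.to("cuda")              # ... and device moves, no manual .to() in forward needed
print(m.freqs.dtype, m.freqs.device)
```
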
@@ -198,33 +201,17 @@ def forward(self, video_fhw, txt_seq_lens, device):
         Args: video_fhw: [frame, height, width], a list of 3 integers representing the shape of the video
             txt_seq_lens: [bs], a list of integers representing the length of each text prompt
         """
-        if self.pos_freqs.device != device:
-            self.pos_freqs = self.pos_freqs.to(device)
-            self.neg_freqs = self.neg_freqs.to(device)
-
         if isinstance(video_fhw, list):
             video_fhw = video_fhw[0]
         frame, height, width = video_fhw
         rope_key = f"{frame}_{height}_{width}"
 
-        if rope_key not in self.rope_cache:
-            seq_lens = frame * height * width
-            freqs_pos = self.pos_freqs.split([x // 2 for x in self.axes_dim], dim=1)
-            freqs_neg = self.neg_freqs.split([x // 2 for x in self.axes_dim], dim=1)
-            freqs_frame = freqs_pos[0][:frame].view(frame, 1, 1, -1).expand(frame, height, width, -1)
-            if self.scale_rope:
-                freqs_height = torch.cat([freqs_neg[1][-(height - height // 2) :], freqs_pos[1][: height // 2]], dim=0)
-                freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1)
-                freqs_width = torch.cat([freqs_neg[2][-(width - width // 2) :], freqs_pos[2][: width // 2]], dim=0)
-                freqs_width = freqs_width.view(1, 1, width, -1).expand(frame, height, width, -1)
-
-            else:
-                freqs_height = freqs_pos[1][:height].view(1, height, 1, -1).expand(frame, height, width, -1)
-                freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1)
-
-            freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
-            self.rope_cache[rope_key] = freqs.clone().contiguous()
-        vid_freqs = self.rope_cache[rope_key]
+        if not torch.compiler.is_compiling():
+            if rope_key not in self.rope_cache:
+                self.rope_cache[rope_key] = self._compute_video_freqs(frame, height, width)
+            vid_freqs = self.rope_cache[rope_key]
+        else:
+            vid_freqs = self._compute_video_freqs(frame, height, width)
 
         if self.scale_rope:
             max_vid_index = max(height // 2, width // 2)
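
The eager-mode dict cache is kept, but it is bypassed when the module runs under `torch.compile`: mutating a Python dict inside a traced `forward` is a common source of graph breaks and recompilations, whereas recomputing the frequencies inside the compiled graph is cheap. A rough sketch of the same guard pattern (assuming a PyTorch version that provides `torch.compiler.is_compiling()`):

```python
import torch

_cache = {}  # eager-mode memo, keyed by "frame_height_width"


def cached_or_fresh(key, compute):
    # Eager: memoize the result so repeated shapes reuse the same tensor.
    # Compiling: skip the Python-side cache so the traced graph stays free of
    # dict lookups and mutations; the compiled artifact itself is reused across calls.
    if not torch.compiler.is_compiling():
        if key not in _cache:
            _cache[key] = compute()
        return _cache[key]
    return compute()
```
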
@@ -236,6 +223,25 @@ def forward(self, video_fhw, txt_seq_lens, device):
 
         return vid_freqs, txt_freqs
 
+    @functools.lru_cache(maxsize=None)
+    def _compute_video_freqs(self, frame, height, width):
+        seq_lens = frame * height * width
+        freqs_pos = self.pos_freqs.split([x // 2 for x in self.axes_dim], dim=1)
+        freqs_neg = self.neg_freqs.split([x // 2 for x in self.axes_dim], dim=1)
+
+        freqs_frame = freqs_pos[0][:frame].view(frame, 1, 1, -1).expand(frame, height, width, -1)
+        if self.scale_rope:
+            freqs_height = torch.cat([freqs_neg[1][-(height - height // 2) :], freqs_pos[1][: height // 2]], dim=0)
+            freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1)
+            freqs_width = torch.cat([freqs_neg[2][-(width - width // 2) :], freqs_pos[2][: width // 2]], dim=0)
+            freqs_width = freqs_width.view(1, 1, width, -1).expand(frame, height, width, -1)
+        else:
+            freqs_height = freqs_pos[1][:height].view(1, height, 1, -1).expand(frame, height, width, -1)
+            freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1)
+
+        freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
+        return freqs.clone().contiguous()
+
 
 class QwenDoubleStreamAttnProcessor2_0:
     """
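
`functools.lru_cache(maxsize=None)` on `_compute_video_freqs` memoizes the computed tensor per `(self, frame, height, width)` combination in eager execution (with the usual caveat that the cache holds a reference to the instance). A tiny illustration of how `lru_cache` behaves on a method, with names made up for the example:

```python
import functools


class Grid:  # illustrative stand-in, not part of the diff
    @functools.lru_cache(maxsize=None)
    def area(self, h, w):
        print("computing")  # runs once per distinct (self, h, w)
        return h * w


g = Grid()
g.area(4, 8)                   # prints "computing"
g.area(4, 8)                   # served from the cache, nothing printed
print(Grid.area.cache_info())  # CacheInfo(hits=1, misses=1, ...)
```
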
@@ -482,6 +488,7 @@ class QwenImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, Fro
     _supports_gradient_checkpointing = True
     _no_split_modules = ["QwenImageTransformerBlock"]
     _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
+    _repeated_blocks = ["QwenImageTransformerBlock"]
 
     @register_to_config
     def __init__(
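
`_repeated_blocks` marks `QwenImageTransformerBlock` as the unit for regional compilation, i.e. compiling just the repeated transformer block rather than the whole model, which cuts compile time while keeping most of the speedup. A hedged usage sketch: `compile_repeated_blocks` is assumed to be available on recent diffusers `ModelMixin` versions, and the `Qwen/Qwen-Image` checkpoint id is an assumption here; the plain `torch.compile` fallback works on any PyTorch 2.x.

```python
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image", torch_dtype=torch.bfloat16).to("cuda")

# Regional compilation (assumption: helper exposed by recent diffusers, driven by _repeated_blocks)
pipe.transformer.compile_repeated_blocks(fullgraph=True)

# Fallback: compile the whole transformer with stock torch.compile
# pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune")

image = pipe("a cup of coffee on a wooden table", num_inference_steps=30).images[0]
image.save("coffee.png")
```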