qwen modular refactor, unpack before decode

yiyixuxu · yiyixuxu · commit 64415abc13e7 · 2025-12-12T21:27:57.000+01:00
diff --git a/src/diffusers/modular_pipelines/qwenimage/decoders.py b/src/diffusers/modular_pipelines/qwenimage/decoders.py
@@ -29,6 +29,45 @@
 
 logger = logging.get_logger(__name__)
 
+class QwenImageAfterDenoiseStep(ModularPipelineBlocks):
+    # Post-denoise block: unpacks `latents` with the pachifier and stores them back in the block state.
+    model_name = "qwenimage"
+
+    @property
+    def description(self) -> str:
+        return "Step that unpack the latents from 3D tensor (batch_size, sequence_length, channels) into 5D tensor (batch_size, channels, 1, height, width)"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        # Only the pachifier is needed here; it is declared with default_creation_method="from_config".
+        return [ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config")]
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        # All three inputs are required: the image size plus the latents themselves.
+        size_params = [InputParam(name=dim, required=True) for dim in ("height", "width")]
+        latents_param = InputParam(
+            name="latents",
+            required=True,
+            type_hint=torch.Tensor,
+            description="The latents to decode, can be generated in the denoise step",
+        )
+        return [*size_params, latents_param]
+
+    @torch.no_grad()
+    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
+        """Replace `block_state.latents` with the pachifier's unpacked result; returns `(components, state)`."""
+        block_state = self.get_block_state(state)
+
+        # height/width are read from the block state, the scale factor from the pipeline components.
+        scale_factor = components.vae_scale_factor
+        unpack = components.pachifier.unpack_latents
+        block_state.latents = unpack(
+            block_state.latents, block_state.height, block_state.width, vae_scale_factor=scale_factor
+        )
+
+        self.set_block_state(state, block_state)
+        return components, state
 
 class QwenImageDecoderStep(ModularPipelineBlocks):
     model_name = "qwenimage"
@@ -41,16 +80,13 @@ def description(self) -> str:
     def expected_components(self) -> List[ComponentSpec]:
         components = [
             ComponentSpec("vae", AutoencoderKLQwenImage),
-            ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
         ]
 
         return components
 
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam(name="height", required=True),
-            InputParam(name="width", required=True),
             InputParam(
                 name="latents",
                 required=True,
@@ -74,10 +110,10 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -
         block_state = self.get_block_state(state)
 
         # YiYi Notes: remove support for output_type = "latents', we can just skip decode/encode step in modular
-        vae_scale_factor = components.vae_scale_factor
-        block_state.latents = components.pachifier.unpack_latents(
-            block_state.latents, block_state.height, block_state.width, vae_scale_factor=vae_scale_factor
-        )
+        if block_state.latents.ndim == 4:
+            block_state.latents = block_state.latents.unsqueeze(dim=1)
+        elif block_state.latents.ndim != 5:
+            raise ValueError(f"expect latents to be a 4D or 5D tensor but got: {block_state.latents.shape}. Please make sure the latents are unpacked before decode step.")
         block_state.latents = block_state.latents.to(components.vae.dtype)
 
         latents_mean = (
diff --git a/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py b/src/diffusers/modular_pipelines/qwenimage/modular_blocks.py
@@ -26,7 +26,7 @@
     QwenImageSetTimestepsStep,
     QwenImageSetTimestepsWithStrengthStep,
 )
-from .decoders import QwenImageDecoderStep, QwenImageInpaintProcessImagesOutputStep, QwenImageProcessImagesOutputStep
+from .decoders import QwenImageAfterDenoiseStep, QwenImageDecoderStep, QwenImageInpaintProcessImagesOutputStep, QwenImageProcessImagesOutputStep
 from .denoise import (
     QwenImageControlNetDenoiseStep,
     QwenImageDenoiseStep,
@@ -92,6 +92,7 @@ def description(self):
         ("set_timesteps", QwenImageSetTimestepsStep()),
         ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
         ("denoise", QwenImageDenoiseStep()),
+        ("after_denoise", QwenImageAfterDenoiseStep()),
         ("decode", QwenImageDecodeStep()),
     ]
 )
@@ -205,6 +206,7 @@ def description(self):
         ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()),
         ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
         ("denoise", QwenImageInpaintDenoiseStep()),
+        ("after_denoise", QwenImageAfterDenoiseStep()),
         ("decode", QwenImageInpaintDecodeStep()),
     ]
 )
@@ -264,6 +266,7 @@ def description(self):
         ("prepare_img2img_latents", QwenImagePrepareLatentsWithStrengthStep()),
         ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
         ("denoise", QwenImageDenoiseStep()),
+        ("after_denoise", QwenImageAfterDenoiseStep()),
         ("decode", QwenImageDecodeStep()),
     ]
 )
@@ -529,8 +532,9 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
         QwenImageAutoBeforeDenoiseStep,
         QwenImageOptionalControlNetBeforeDenoiseStep,
         QwenImageAutoDenoiseStep,
+        QwenImageAfterDenoiseStep,
     ]
-    block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise"]
+    block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise", "after_denoise"]
 
     @property
     def description(self):
@@ -653,6 +657,7 @@ def description(self):
         ("set_timesteps", QwenImageSetTimestepsStep()),
         ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()),
         ("denoise", QwenImageEditDenoiseStep()),
+        ("after_denoise", QwenImageAfterDenoiseStep()),
         ("decode", QwenImageDecodeStep()),
     ]
 )
@@ -702,6 +707,7 @@ def description(self) -> str:
         ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()),
         ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()),
         ("denoise", QwenImageEditInpaintDenoiseStep()),
+        ("after_denoise", QwenImageAfterDenoiseStep()),
         ("decode", QwenImageInpaintDecodeStep()),
     ]
 )
@@ -841,8 +847,9 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):
         QwenImageEditAutoInputStep,
         QwenImageEditAutoBeforeDenoiseStep,
         QwenImageEditAutoDenoiseStep,
+        QwenImageAfterDenoiseStep,
     ]
-    block_names = ["input", "before_denoise", "denoise"]
+    block_names = ["input", "before_denoise", "denoise", "after_denoise"]
 
     @property
     def description(self):
@@ -954,6 +961,7 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks):
         ("set_timesteps", QwenImageSetTimestepsStep()),
         ("prepare_rope_inputs", QwenImageEditPlusRoPEInputsStep()),
         ("denoise", QwenImageEditDenoiseStep()),
+        ("after_denoise", QwenImageAfterDenoiseStep()),
         ("decode", QwenImageDecodeStep()),
     ]
 )
@@ -1037,8 +1045,9 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):
         QwenImageEditPlusAutoInputStep,
         QwenImageEditPlusAutoBeforeDenoiseStep,
         QwenImageEditAutoDenoiseStep,
+        QwenImageAfterDenoiseStep,
     ]
-    block_names = ["input", "before_denoise", "denoise"]
+    block_names = ["input", "before_denoise", "denoise", "after_denoise"]
 
     @property
     def description(self):

Original file line number	Diff line number	Diff line change
`@@ -26,7 +26,7 @@`
`26`	`26`	`QwenImageSetTimestepsStep,`
`27`	`27`	`QwenImageSetTimestepsWithStrengthStep,`
`28`	`28`	`)`
`29`		`-from .decoders import QwenImageDecoderStep, QwenImageInpaintProcessImagesOutputStep, QwenImageProcessImagesOutputStep`
	`29`	`+from .decoders import QwenImageAfterDenoiseStep, QwenImageDecoderStep, QwenImageInpaintProcessImagesOutputStep, QwenImageProcessImagesOutputStep`
`30`	`30`	`from .denoise import (`
`31`	`31`	`QwenImageControlNetDenoiseStep,`
`32`	`32`	`QwenImageDenoiseStep,`
`@@ -92,6 +92,7 @@ def description(self):`
`92`	`92`	`("set_timesteps", QwenImageSetTimestepsStep()),`
`93`	`93`	`("prepare_rope_inputs", QwenImageRoPEInputsStep()),`
`94`	`94`	`("denoise", QwenImageDenoiseStep()),`
	`95`	`+ ("after_denoise", QwenImageAfterDenoiseStep()),`
`95`	`96`	`("decode", QwenImageDecodeStep()),`
`96`	`97`	`]`
`97`	`98`	`)`
`@@ -205,6 +206,7 @@ def description(self):`
`205`	`206`	`("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()),`
`206`	`207`	`("prepare_rope_inputs", QwenImageRoPEInputsStep()),`
`207`	`208`	`("denoise", QwenImageInpaintDenoiseStep()),`
	`209`	`+ ("after_denoise", QwenImageAfterDenoiseStep()),`
`208`	`210`	`("decode", QwenImageInpaintDecodeStep()),`
`209`	`211`	`]`
`210`	`212`	`)`
`@@ -264,6 +266,7 @@ def description(self):`
`264`	`266`	`("prepare_img2img_latents", QwenImagePrepareLatentsWithStrengthStep()),`
`265`	`267`	`("prepare_rope_inputs", QwenImageRoPEInputsStep()),`
`266`	`268`	`("denoise", QwenImageDenoiseStep()),`
	`269`	`+ ("after_denoise", QwenImageAfterDenoiseStep()),`
`267`	`270`	`("decode", QwenImageDecodeStep()),`
`268`	`271`	`]`
`269`	`272`	`)`
`@@ -529,8 +532,9 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):`
`529`	`532`	`QwenImageAutoBeforeDenoiseStep,`
`530`	`533`	`QwenImageOptionalControlNetBeforeDenoiseStep,`
`531`	`534`	`QwenImageAutoDenoiseStep,`
	`535`	`+ QwenImageAfterDenoiseStep,`
`532`	`536`	`]`
`533`		`- block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise"]`
	`537`	`+ block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise", "after_denoise"]`
`534`	`538`
`535`	`539`	`@property`
`536`	`540`	`def description(self):`
`@@ -653,6 +657,7 @@ def description(self):`
`653`	`657`	`("set_timesteps", QwenImageSetTimestepsStep()),`
`654`	`658`	`("prepare_rope_inputs", QwenImageEditRoPEInputsStep()),`
`655`	`659`	`("denoise", QwenImageEditDenoiseStep()),`
	`660`	`+ ("after_denoise", QwenImageAfterDenoiseStep()),`
`656`	`661`	`("decode", QwenImageDecodeStep()),`
`657`	`662`	`]`
`658`	`663`	`)`
`@@ -702,6 +707,7 @@ def description(self) -> str:`
`702`	`707`	`("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()),`
`703`	`708`	`("prepare_rope_inputs", QwenImageEditRoPEInputsStep()),`
`704`	`709`	`("denoise", QwenImageEditInpaintDenoiseStep()),`
	`710`	`+ ("after_denoise", QwenImageAfterDenoiseStep()),`
`705`	`711`	`("decode", QwenImageInpaintDecodeStep()),`
`706`	`712`	`]`
`707`	`713`	`)`
`@@ -841,8 +847,9 @@ class QwenImageEditCoreDenoiseStep(SequentialPipelineBlocks):`
`841`	`847`	`QwenImageEditAutoInputStep,`
`842`	`848`	`QwenImageEditAutoBeforeDenoiseStep,`
`843`	`849`	`QwenImageEditAutoDenoiseStep,`
	`850`	`+ QwenImageAfterDenoiseStep,`
`844`	`851`	`]`
`845`		`- block_names = ["input", "before_denoise", "denoise"]`
	`852`	`+ block_names = ["input", "before_denoise", "denoise", "after_denoise"]`
`846`	`853`
`847`	`854`	`@property`
`848`	`855`	`def description(self):`
`@@ -954,6 +961,7 @@ class QwenImageEditPlusInputStep(SequentialPipelineBlocks):`
`954`	`961`	`("set_timesteps", QwenImageSetTimestepsStep()),`
`955`	`962`	`("prepare_rope_inputs", QwenImageEditPlusRoPEInputsStep()),`
`956`	`963`	`("denoise", QwenImageEditDenoiseStep()),`
	`964`	`+ ("after_denoise", QwenImageAfterDenoiseStep()),`
`957`	`965`	`("decode", QwenImageDecodeStep()),`
`958`	`966`	`]`
`959`	`967`	`)`
`@@ -1037,8 +1045,9 @@ class QwenImageEditPlusCoreDenoiseStep(SequentialPipelineBlocks):`
`1037`	`1045`	`QwenImageEditPlusAutoInputStep,`
`1038`	`1046`	`QwenImageEditPlusAutoBeforeDenoiseStep,`
`1039`	`1047`	`QwenImageEditAutoDenoiseStep,`
	`1048`	`+ QwenImageAfterDenoiseStep,`
`1040`	`1049`	`]`
`1041`		`- block_names = ["input", "before_denoise", "denoise"]`
	`1050`	`+ block_names = ["input", "before_denoise", "denoise", "after_denoise"]`
`1042`	`1051`
`1043`	`1052`	`@property`
`1044`	`1053`	`def description(self):`