Commit 49e683f

start to work on edit

1 parent ff06e95 commit 49e683f

File tree

1 file changed: +189 -7 lines changed

src/diffusers/modular_pipelines/qwenimage/encoders.py

Lines changed: 189 additions & 7 deletions

@@ -25,9 +25,18 @@
 
 from .modular_pipeline import QwenImageModularPipeline
 
+from ...pipelines.qwenimage.pipeline_qwenimage import calculate_dimensions
+
 logger = logging.get_logger(__name__)
 
 
+def _extract_masked_hidden(hidden_states: torch.Tensor, mask: torch.Tensor):
+    bool_mask = mask.bool()
+    valid_lengths = bool_mask.sum(dim=1)
+    selected = hidden_states[bool_mask]
+    split_result = torch.split(selected, valid_lengths.tolist(), dim=0)
+    return split_result
+
 def get_qwen_prompt_embeds(
     text_encoder,
     tokenizer,
@@ -53,13 +62,6 @@ def get_qwen_prompt_embeds(
     )
     hidden_states = encoder_hidden_states.hidden_states[-1]
 
-    def _extract_masked_hidden(hidden_states: torch.Tensor, mask: torch.Tensor):
-        bool_mask = mask.bool()
-        valid_lengths = bool_mask.sum(dim=1)
-        selected = hidden_states[bool_mask]
-        split_result = torch.split(selected, valid_lengths.tolist(), dim=0)
-        return split_result
-
     split_hidden_states = _extract_masked_hidden(hidden_states, txt_tokens.attention_mask)
     split_hidden_states = [e[drop_idx:] for e in split_hidden_states]
     attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states]
@@ -75,6 +77,55 @@ def _extract_masked_hidden(hidden_states: torch.Tensor, mask: torch.Tensor):
 
     return prompt_embeds, encoder_attention_mask
 
+
+def get_qwen_prompt_embeds_edit(
+    text_encoder,
+    processor,
+    prompt: Union[str, List[str]] = None,
+    image: Optional[torch.Tensor] = None,
+    prompt_template_encode: str = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n",
+    prompt_template_encode_start_idx: int = 64,
+    device: Optional[torch.device] = None,
+    dtype: Optional[torch.dtype] = None,
+):
+
+    prompt = [prompt] if isinstance(prompt, str) else prompt
+
+    template = prompt_template_encode
+    drop_idx = prompt_template_encode_start_idx
+    txt = [template.format(e) for e in prompt]
+
+    model_inputs = processor(
+        text=txt,
+        images=image,
+        padding=True,
+        return_tensors="pt",
+    ).to(device)
+
+    outputs = text_encoder(
+        input_ids=model_inputs.input_ids,
+        attention_mask=model_inputs.attention_mask,
+        pixel_values=model_inputs.pixel_values,
+        image_grid_thw=model_inputs.image_grid_thw,
+        output_hidden_states=True,
+    )
+
+    hidden_states = outputs.hidden_states[-1]
+    split_hidden_states = _extract_masked_hidden(hidden_states, model_inputs.attention_mask)
+    split_hidden_states = [e[drop_idx:] for e in split_hidden_states]
+    attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states]
+    max_seq_len = max([e.size(0) for e in split_hidden_states])
+    prompt_embeds = torch.stack(
+        [torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states]
+    )
+    encoder_attention_mask = torch.stack(
+        [torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list]
+    )
+
+    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
+
+    return prompt_embeds, encoder_attention_mask
+
 class QwenImageTextEncoderStep(ModularPipelineBlocks):
     model_name = "qwenimage"
 
@@ -139,6 +190,137 @@ def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
         device = components._execution_device
         self.check_inputs(block_state.prompt, block_state.negative_prompt, block_state.max_sequence_length)
 
+        block_state.prompt_embeds, block_state.prompt_embeds_mask = get_qwen_prompt_embeds(
+            components.text_encoder,
+            components.tokenizer,
+            prompt=block_state.prompt,
+            prompt_template_encode=components.config.prompt_template_encode,
+            prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
+            tokenizer_max_length=components.config.tokenizer_max_length,
+            device=device,
+        )
+
+        block_state.prompt_embeds = block_state.prompt_embeds[:, :block_state.max_sequence_length]
+        block_state.prompt_embeds_mask = block_state.prompt_embeds_mask[:, :block_state.max_sequence_length]
+
+        if components.requires_unconditional_embeds:
+            block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = get_qwen_prompt_embeds(
+                components.text_encoder,
+                components.tokenizer,
+                prompt=block_state.negative_prompt,
+                prompt_template_encode=components.config.prompt_template_encode,
+                prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
+                tokenizer_max_length=components.config.tokenizer_max_length,
+                device=device,
+            )
+            block_state.negative_prompt_embeds = block_state.negative_prompt_embeds[:, :block_state.max_sequence_length]
+            block_state.negative_prompt_embeds_mask = block_state.negative_prompt_embeds_mask[:, :block_state.max_sequence_length]
+
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class QwenImageImageResizeStep(ModularPipelineBlocks):
+    model_name = "qwenimage"
+
+    @property
+    def description(self) -> str:
+        return "Image Resize step that resizes the image to the target area while maintaining the aspect ratio"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec("image_processor", VaeImageProcessor, config=FrozenDict({"vae_scale_factor": 16}), default_creation_method="from_config"),
+        ]
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam(name="image", required=True, type_hint=torch.Tensor, description="The image to resize"),
+        ]
+
+    @torch.no_grad()
+    def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
+        block_state = self.get_block_state(state)
+
+        if not isinstance(block_state.image, list):
+            block_state.image = [block_state.image]
+
+        image_width, image_height = block_state.image[0].size
+        calculated_width, calculated_height = calculate_dimensions(1024 * 1024, image_width / image_height)
+
+        # resize each input image to the computed target size
+        block_state.image = [
+            components.image_processor.resize(img, height=calculated_height, width=calculated_width)
+            for img in block_state.image
+        ]
+        self.set_block_state(state, block_state)
+        return components, state
+
+
+class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
+    model_name = "qwenimage"
+
+    @property
+    def description(self) -> str:
+        return "Text Encoder step that generates text embeddings to guide the image generation"
+
+    @property
+    def expected_components(self) -> List[ComponentSpec]:
+        return [
+            ComponentSpec("text_encoder", Qwen2_5_VLForConditionalGeneration),
+            ComponentSpec("processor", Qwen2VLProcessor),
+            ComponentSpec(
+                "guider",
+                ClassifierFreeGuidance,
+                config=FrozenDict({"guidance_scale": 4.0}),
+                default_creation_method="from_config",
+            ),
+        ]
+
+    @property
+    def expected_configs(self) -> List[ConfigSpec]:
+        return [
+            ConfigSpec(name="prompt_template_encode", default="<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n"),
+            ConfigSpec(name="prompt_template_encode_start_idx", default=64),
+        ]
+
+    @property
+    def inputs(self) -> List[InputParam]:
+        return [
+            InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"),
+            InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"),
+            InputParam(name="max_sequence_length", type_hint=int, description="The max sequence length to use", default=1024),
+        ]
+
+    @property
+    def intermediate_outputs(self) -> List[OutputParam]:
+        return [
+            OutputParam(name="prompt_embeds", kwargs_type="guider_input_fields", type_hint=torch.Tensor, description="The prompt embeddings"),
+            OutputParam(name="prompt_embeds_mask", kwargs_type="guider_input_fields", type_hint=torch.Tensor, description="The encoder attention mask"),
+            OutputParam(name="negative_prompt_embeds", kwargs_type="guider_input_fields", type_hint=torch.Tensor, description="The negative prompt embeddings"),
+            OutputParam(name="negative_prompt_embeds_mask", kwargs_type="guider_input_fields", type_hint=torch.Tensor, description="The negative prompt embeddings mask"),
+        ]
+
+    @staticmethod
+    def check_inputs(prompt, negative_prompt, max_sequence_length):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+
+        if negative_prompt is not None and not isinstance(negative_prompt, str) and not isinstance(negative_prompt, list):
+            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")
+
+        if max_sequence_length is not None and max_sequence_length > 1024:
+            raise ValueError(f"`max_sequence_length` cannot be greater than 1024 but is {max_sequence_length}")
+
+    @torch.no_grad()
+    def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
+        block_state = self.get_block_state(state)
+
+        self.check_inputs(block_state.prompt, block_state.negative_prompt, block_state.max_sequence_length)
+
+        device = components._execution_device
+        image = components.image_processor.preprocess(block_state.image)
+
+
         block_state.prompt_embeds, block_state.prompt_embeds_mask = get_qwen_prompt_embeds(
             components.text_encoder,
             components.tokenizer,

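A note for readers skimming the diff: both `get_qwen_prompt_embeds` and the new `get_qwen_prompt_embeds_edit` do the same thing once the encoder has run, i.e. drop the template tokens, keep only the positions the attention mask marks as valid, and right-pad everything back to a common length before stacking. The following is a minimal, self-contained sketch of that packing logic; the toy tensors and the small sizes are invented for illustration, and only `torch` is assumed.

```python
import torch

def _extract_masked_hidden(hidden_states: torch.Tensor, mask: torch.Tensor):
    # Keep only positions where the attention mask is 1, then split the flat
    # result back into one variable-length sequence per batch element.
    bool_mask = mask.bool()
    valid_lengths = bool_mask.sum(dim=1)
    selected = hidden_states[bool_mask]
    return torch.split(selected, valid_lengths.tolist(), dim=0)

# Toy inputs: batch of 2 sequences, max length 5, hidden size 4.
hidden = torch.randn(2, 5, 4)
mask = torch.tensor([[1, 1, 1, 0, 0],
                     [1, 1, 1, 1, 1]])
drop_idx = 2  # pretend the first 2 tokens belong to the chat template

split = _extract_masked_hidden(hidden, mask)   # per-sample lengths: [3, 5]
split = [e[drop_idx:] for e in split]          # after dropping template: [1, 3]
attn = [torch.ones(e.size(0), dtype=torch.long) for e in split]
max_len = max(e.size(0) for e in split)

# Right-pad every sequence (and its mask) with zeros up to max_len, then stack.
prompt_embeds = torch.stack(
    [torch.cat([u, u.new_zeros(max_len - u.size(0), u.size(1))]) for u in split]
)
attention_mask = torch.stack(
    [torch.cat([u, u.new_zeros(max_len - u.size(0))]) for u in attn]
)

print(prompt_embeds.shape)  # torch.Size([2, 3, 4])
print(attention_mask)       # tensor([[1, 0, 0], [1, 1, 1]])
```

In the diff this packing appears twice with the same shape conventions: once with `txt_tokens.attention_mask` for text-only prompts, and once with `model_inputs.attention_mask` for the multimodal edit path.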
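`QwenImageImageResizeStep` maps the input image onto a roughly 1024x1024 pixel budget while preserving its aspect ratio via `calculate_dimensions(1024 * 1024, image_width / image_height)`. The sketch below is a plausible stand-in for that computation, written only for illustration: `_calculate_dimensions` is a hypothetical local helper and the multiple-of-32 rounding is an assumption, not the behavior of the function imported in the diff.

```python
import math

def _calculate_dimensions(target_area: int, ratio: float, multiple: int = 32):
    # Pick (width, height) with width/height ~ ratio and width*height ~ target_area,
    # snapped to a multiple that downstream VAE/patching stages typically require.
    width = math.sqrt(target_area * ratio)
    height = width / ratio
    width = round(width / multiple) * multiple
    height = round(height / multiple) * multiple
    return width, height

# A 1536x1024 (3:2) input mapped onto a 1024*1024 pixel budget.
w, h = _calculate_dimensions(1024 * 1024, 1536 / 1024)
print(w, h)  # 1248 832 -> aspect ratio 1.5 preserved, area close to 1024*1024
```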