<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->

# Building Custom Blocks

Modular Diffusers allows you to create custom blocks that can be used in a pipeline. This guide will show you how to create a custom block, define its inputs and outputs, and implement the computation logic.

Let's create a custom block that uses the Florence-2 model to process an input image and generate a mask for inpainting.

First, let's define a custom block in a file called `block.py`:

```py
from typing import List, Union

import numpy as np
import torch
from PIL import Image, ImageDraw
from transformers import AutoModelForCausalLM, AutoProcessor

from diffusers.modular_pipelines import (
    ComponentSpec,
    InputParam,
    ModularPipelineBlocks,
    OutputParam,
    PipelineState,
)


class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
    @property
    def expected_components(self):
        # Components are loaded from these Hub repos when `load_components` is called
        return [
            ComponentSpec(
                name="image_annotator",
                type_hint=AutoModelForCausalLM,
                repo="mrhendrey/Florence-2-large-ft-safetensors",
            ),
            ComponentSpec(
                name="image_annotator_processor",
                type_hint=AutoProcessor,
                repo="mrhendrey/Florence-2-large-ft-safetensors",
            ),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(
                "image",
                type_hint=Union[Image.Image, List[Image.Image]],
                required=True,
                description="Image(s) to annotate",
            ),
            InputParam(
                "annotation_task",
                type_hint=Union[str, List[str]],
                required=True,
                default="<REFERRING_EXPRESSION_SEGMENTATION>",
                description="""Annotation task to perform on the image.
                Supported tasks:

                <OD>
                <REFERRING_EXPRESSION_SEGMENTATION>
                <CAPTION>
                <DETAILED_CAPTION>
                <MORE_DETAILED_CAPTION>
                <DENSE_REGION_CAPTION>
                <CAPTION_TO_PHRASE_GROUNDING>
                <OPEN_VOCABULARY_DETECTION>
                """,
            ),
            InputParam(
                "annotation_prompt",
                type_hint=Union[str, List[str]],
                required=True,
                description="""Annotation prompt to provide more context to the task.
                Can be used to detect or segment out specific elements in the image.
                """,
            ),
            InputParam(
                "annotation_output_type",
                type_hint=str,
                required=True,
                default="mask_image",
                description="""Output type from annotation predictions. Available options are
                annotation:
                    - raw annotation predictions from the model based on the task type
                mask_image:
                    - black and white mask image for the given image based on the task type
                mask_overlay:
                    - white mask overlaid on the original image
                bounding_box:
                    - bounding boxes drawn on the original image
                """,
            ),
            InputParam(
                "annotation_overlay",
                type_hint=bool,
                required=True,
                default=False,
                description="Whether to overlay the annotation mask on the original image",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(
                "mask_image",
                type_hint=Image.Image,
                description="Inpainting mask for the input image(s)",
            ),
            OutputParam(
                "annotations",
                type_hint=dict,
                description="Annotation predictions for the input image(s)",
            ),
            OutputParam(
                "image",
                type_hint=Image.Image,
                description="Annotated input image(s)",
            ),
        ]

    def get_annotations(self, components, images, prompts, task):
        # Florence-2 expects the task token to be prepended to the text prompt
        task_prompts = [task + prompt for prompt in prompts]

        inputs = components.image_annotator_processor(
            text=task_prompts, images=images, return_tensors="pt"
        ).to(components.image_annotator.device, components.image_annotator.dtype)

        generated_ids = components.image_annotator.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            early_stopping=False,
            do_sample=False,
            num_beams=3,
        )
        annotations = components.image_annotator_processor.batch_decode(
            generated_ids, skip_special_tokens=False
        )
        # Convert raw generations into structured, task-specific predictions
        outputs = []
        for image, annotation in zip(images, annotations):
            outputs.append(
                components.image_annotator_processor.post_process_generation(
                    annotation, task=task, image_size=(image.width, image.height)
                )
            )
        return outputs

    def prepare_mask(self, images, annotations, overlay=False):
        masks = []
        for image, annotation in zip(images, annotations):
            # Draw on a copy of the image for overlays, or on a blank canvas for masks
            mask_image = image.copy() if overlay else Image.new("L", image.size, 0)
            draw = ImageDraw.Draw(mask_image)

            for _, _annotation in annotation.items():
                if "polygons" in _annotation:
                    for polygon in _annotation["polygons"]:
                        polygon = np.array(polygon).reshape(-1, 2)
                        if len(polygon) < 3:
                            continue
                        polygon = polygon.reshape(-1).tolist()
                        draw.polygon(polygon, fill="white")

                elif "bbox" in _annotation:
                    bbox = _annotation["bbox"]
                    draw.rectangle(bbox, fill="white")

            masks.append(mask_image)

        return masks

    def prepare_bounding_boxes(self, images, annotations):
        outputs = []
        for image, annotation in zip(images, annotations):
            image_copy = image.copy()
            draw = ImageDraw.Draw(image_copy)
            for _, _annotation in annotation.items():
                bbox = _annotation["bbox"]
                label = _annotation["label"]

                draw.rectangle(bbox, outline="red", width=3)
                draw.text((bbox[0], bbox[1] - 20), label, fill="red")

            outputs.append(image_copy)

        return outputs

    def prepare_inputs(self, images, prompts):
        prompts = prompts or ""

        if isinstance(images, Image.Image):
            images = [images]
        if isinstance(prompts, str):
            prompts = [prompts]

        if len(images) != len(prompts):
            raise ValueError("Number of images and annotation prompts must match.")

        return images, prompts

    @torch.no_grad()
    def __call__(self, components, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)
        images, annotation_task_prompt = self.prepare_inputs(
            block_state.image, block_state.annotation_prompt
        )
        task = block_state.annotation_task

        annotations = self.get_annotations(
            components, images, annotation_task_prompt, task
        )
        block_state.annotations = annotations
        if block_state.annotation_output_type == "mask_image":
            block_state.mask_image = self.prepare_mask(images, annotations)
        else:
            block_state.mask_image = None

        if block_state.annotation_output_type == "mask_overlay":
            block_state.image = self.prepare_mask(images, annotations, overlay=True)

        elif block_state.annotation_output_type == "bounding_box":
            block_state.image = self.prepare_bounding_boxes(images, annotations)

        self.set_block_state(state, block_state)

        return components, state
```
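
Before saving the block, it's worth a quick sanity check that the components, inputs, and outputs are registered the way you expect. A minimal check (this assumes, as elsewhere in Modular Diffusers, that a block's repr renders a summary of its specs):

```py
from block import Florence2ImageAnnotatorBlock

block = Florence2ImageAnnotatorBlock()

# The printed summary should list the two Florence-2 components along with
# the inputs and intermediate outputs defined above
print(block)
```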

Once we have defined our custom block, we can save it to a model repo so that we can easily reuse it.

There are two ways to save the block:

1. From the CLI

```shell
# In the folder with the `block.py` file, run:
diffusers-cli custom_block
```

Then upload the block to the Hub:

```shell
hf upload <your repo id> . .
```

2. From Python

```py
from block import Florence2ImageAnnotatorBlock

block = Florence2ImageAnnotatorBlock()
block.push_to_hub("<your repo id>")
```

## Using the Custom Block

Let's use this custom block in an inpainting workflow.

```py
import torch
from diffusers.modular_pipelines import ModularPipelineBlocks, SequentialPipelineBlocks
from diffusers.modular_pipelines.stable_diffusion_xl import INPAINT_BLOCKS
from diffusers.utils import load_image

# Fetch the Florence-2 image annotator block that will create our mask
image_annotator_block = ModularPipelineBlocks.from_pretrained(
    "diffusers/florence2-image-annotator", trust_remote_code=True
)

my_blocks = INPAINT_BLOCKS.copy()
# Insert the annotation block before the image encoding step
my_blocks.insert("image_annotator", image_annotator_block, 1)

# Compose the blocks into a single sequential pipeline
blocks = SequentialPipelineBlocks.from_blocks_dict(my_blocks)

repo_id = "diffusers-internal-dev/modular-sdxl-inpainting"
pipe = blocks.init_pipeline(repo_id)
pipe.load_components(torch_dtype=torch.float16, device_map="cuda", trust_remote_code=True)

image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true"
)
image = image.resize((1024, 1024))

prompt = ["A red car"]
annotation_task = "<REFERRING_EXPRESSION_SEGMENTATION>"
annotation_prompt = ["the car"]

output = pipe(
    prompt=prompt,
    image=image,
    annotation_task=annotation_task,
    annotation_prompt=annotation_prompt,
    annotation_output_type="mask_image",
    num_inference_steps=35,
    guidance_scale=7.5,
    strength=0.95,
    output="images",
)
output[0].save("florence-inpainting.png")
```
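
If you want to inspect the mask the annotator produced for the inpainting step, you can run the same call but select the block's intermediate output instead. This is a sketch, assuming intermediate outputs such as `mask_image` can be requested by name through the `output` argument, just like `"images"` above:

```py
# Re-run the pipeline, returning the annotator's intermediate output.
# `mask_image` holds one mask per input image, so index into the result.
mask = pipe(
    prompt=prompt,
    image=image,
    annotation_task=annotation_task,
    annotation_prompt=annotation_prompt,
    annotation_output_type="mask_image",
    num_inference_steps=35,
    guidance_scale=7.5,
    strength=0.95,
    output="mask_image",
)
mask[0].save("florence-mask.png")
```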
