Commit 6f76085

Merge branch 'main' into missing_kwargs_lora_pipeline_py

2 parents 2fb459c + 4e3ddd5
File tree

16 files changed: +4186 −38 lines

docs/source/en/quantization/torchao.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -126,7 +126,7 @@ image = pipe(prompt, num_inference_steps=30, guidance_scale=7.0).images[0]
 image.save("output.png")
 ```
 
-Some quantization methods, such as `uint4wo`, cannot be loaded directly and may result in an `UnpicklingError` when trying to load the models, but work as expected when saving them. In order to work around this, one can load the state dict manually into the model. Note, however, that this requires using `weights_only=False` in `torch.load`, so it should be run only if the weights were obtained from a trustable source.
+If you are using `torch<=2.6.0`, some quantization methods, such as `uint4wo`, cannot be loaded directly and may result in an `UnpicklingError` when trying to load the models, but work as expected when saving them. In order to work around this, one can load the state dict manually into the model. Note, however, that this requires using `weights_only=False` in `torch.load`, so it should be run only if the weights were obtained from a trustable source.
 
 ```python
 import torch
````

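For context, the manual-loading workaround that the edited paragraph describes might look like the sketch below. This is not part of the commit: the model class (`FluxTransformer2DModel`), the checkpoint directory, and the use of `accelerate`'s `init_empty_weights` are illustrative assumptions.

```python
import torch
from accelerate import init_empty_weights
from diffusers import FluxTransformer2DModel

# Hypothetical directory holding a torchao-quantized checkpoint that was
# saved earlier with save_pretrained(..., safe_serialization=False).
ckpt_dir = "/path/to/flux_uint4wo"

# Build the model skeleton without allocating real weights.
with init_empty_weights():
    config = FluxTransformer2DModel.load_config(ckpt_dir)
    transformer = FluxTransformer2DModel.from_config(config)

# Load the state dict manually. weights_only=False is what works around the
# UnpicklingError, so only use it on weights from a trusted source.
state_dict = torch.load(f"{ckpt_dir}/diffusion_pytorch_model.bin", map_location="cpu", weights_only=False)
transformer.load_state_dict(state_dict, strict=True, assign=True)
```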
examples/community/mixture_tiling_sdxl.py

Lines changed: 22 additions & 22 deletions

```diff
@@ -1,4 +1,4 @@
-# Copyright 2025 The HuggingFace Team. All rights reserved.
+# Copyright 2025 The DEVAIEXP Team and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1070,32 +1070,32 @@ def __call__(
                     text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
                 else:
                     text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
-                    add_time_ids = self._get_add_time_ids(
-                        original_size,
-                        crops_coords_top_left[row][col],
-                        target_size,
+                add_time_ids = self._get_add_time_ids(
+                    original_size,
+                    crops_coords_top_left[row][col],
+                    target_size,
+                    dtype=prompt_embeds.dtype,
+                    text_encoder_projection_dim=text_encoder_projection_dim,
+                )
+                if negative_original_size is not None and negative_target_size is not None:
+                    negative_add_time_ids = self._get_add_time_ids(
+                        negative_original_size,
+                        negative_crops_coords_top_left[row][col],
+                        negative_target_size,
                         dtype=prompt_embeds.dtype,
                         text_encoder_projection_dim=text_encoder_projection_dim,
                     )
-                    if negative_original_size is not None and negative_target_size is not None:
-                        negative_add_time_ids = self._get_add_time_ids(
-                            negative_original_size,
-                            negative_crops_coords_top_left[row][col],
-                            negative_target_size,
-                            dtype=prompt_embeds.dtype,
-                            text_encoder_projection_dim=text_encoder_projection_dim,
-                        )
-                    else:
-                        negative_add_time_ids = add_time_ids
+                else:
+                    negative_add_time_ids = add_time_ids
 
-                    if self.do_classifier_free_guidance:
-                        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
-                        add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
-                        add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
+                if self.do_classifier_free_guidance:
+                    prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
+                    add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
+                    add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
 
-                    prompt_embeds = prompt_embeds.to(device)
-                    add_text_embeds = add_text_embeds.to(device)
-                    add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
+                prompt_embeds = prompt_embeds.to(device)
+                add_text_embeds = add_text_embeds.to(device)
+                add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
                 addition_embed_type_row.append((prompt_embeds, add_text_embeds, add_time_ids))
             embeddings_and_added_time.append(addition_embed_type_row)
 
```
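Read as a whole, the second hunk is an indentation fix: in the old code the time-ids block, and everything after it through the `.to(device)` calls, was nested under the `else:` branch of the standard SDXL text-encoder check, so it only ran when that branch was taken; the new code de-indents the block so it runs for every tile. The before/after sketch below shows just the control-flow change; the shape is reconstructed from the diff's shared context lines and is an assumption, not code quoted from the repository.

```python
# Before (sketch, assumed): the time-ids block ran only in the else-branch.
if self.text_encoder_2 is None:
    text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
else:
    text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
    add_time_ids = self._get_add_time_ids(...)  # skipped when text_encoder_2 is None

# After (sketch): the block runs for every tile, whichever branch is taken.
if self.text_encoder_2 is None:
    text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
else:
    text_encoder_projection_dim = self.text_encoder_2.config.projection_dim
add_time_ids = self._get_add_time_ids(
    original_size,
    crops_coords_top_left[row][col],  # crop coordinates indexed per tile
    target_size,
    dtype=prompt_embeds.dtype,
    text_encoder_projection_dim=text_encoder_projection_dim,
)
```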

Lines changed: 32 additions & 0 deletions

````diff
@@ -0,0 +1,32 @@
+# AnyTextPipeline
+
+Project page: https://aigcdesigngroup.github.io/homepage_anytext
+
+"AnyText comprises a diffusion pipeline with two primary elements: an auxiliary latent module and a text embedding module. The former uses inputs like text glyph, position, and masked image to generate latent features for text generation or editing. The latter employs an OCR model for encoding stroke data as embeddings, which blend with image caption embeddings from the tokenizer to generate texts that seamlessly integrate with the background. We employed text-control diffusion loss and text perceptual loss for training to further enhance writing accuracy."
+
+Each text line to be generated must be enclosed in double quotes in the prompt. For any usage questions, please refer to the [paper](https://arxiv.org/abs/2311.03054).
+
+
+```py
+import torch
+from diffusers import DiffusionPipeline
+from anytext_controlnet import AnyTextControlNetModel
+from diffusers.utils import load_image
+
+# A font file is required; this one was shared by a Hugging Face staff member:
+# !wget https://huggingface.co/spaces/ysharma/TranslateQuotesInImageForwards/resolve/main/arial-unicode-ms.ttf
+
+anytext_controlnet = AnyTextControlNetModel.from_pretrained("tolgacangoz/anytext-controlnet", torch_dtype=torch.float16,
+                                                            variant="fp16")
+pipe = DiffusionPipeline.from_pretrained("tolgacangoz/anytext", font_path="arial-unicode-ms.ttf",
+                                         controlnet=anytext_controlnet, torch_dtype=torch.float16,
+                                         trust_remote_code=False,  # One needs to give permission to run this pipeline's code
+                                         ).to("cuda")
+
+# Generate an image; draw_pos marks where the text should be rendered.
+prompt = 'photo of caramel macchiato coffee on the table, top-down perspective, with "Any" "Text" written on it using cream'
+draw_pos = load_image("https://raw.githubusercontent.com/tyxsspa/AnyText/refs/heads/main/example_images/gen9.png")
+image = pipe(prompt, num_inference_steps=20, mode="generate", draw_pos=draw_pos,
+             ).images[0]
+image
+```
````
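A note on the snippet, as an observation about the API rather than part of the commit: `DiffusionPipeline.from_pretrained` only executes pipeline code hosted on the Hub when `trust_remote_code=True`, so the call above is expected to refuse to run as written; per the inline comment, set the flag to `True` once you have reviewed the code in the `tolgacangoz/anytext` repository.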
