Skip to content

Commit bf7a666

Browse files
authored
Merge branch 'main' into main
2 parents 72495ec + 95c5ce4 commit bf7a666

File tree

132 files changed

+1632
-114
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

132 files changed

+1632
-114
lines changed

.github/workflows/pr_tests.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ jobs:
266266
# TODO (sayakpaul, DN6): revisit `--no-deps`
267267
python -m pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
268268
python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
269-
python -m uv pip install -U tokenizers@git+https://github.com/huggingface/tokenizers.git --no-deps
269+
python -m uv pip install -U tokenizers
270270
pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
271271
272272
- name: Environment

examples/community/README.md

Lines changed: 5 additions & 5 deletions
Large diffs are not rendered by default.

examples/community/lpw_stable_diffusion_xl.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -827,7 +827,9 @@ def encode_prompt(
827827
)
828828

829829
# We are only ALWAYS interested in the pooled output of the final text encoder
830-
pooled_prompt_embeds = prompt_embeds[0]
830+
if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
831+
pooled_prompt_embeds = prompt_embeds[0]
832+
831833
prompt_embeds = prompt_embeds.hidden_states[-2]
832834

833835
prompt_embeds_list.append(prompt_embeds)
@@ -879,7 +881,8 @@ def encode_prompt(
879881
output_hidden_states=True,
880882
)
881883
# We are only ALWAYS interested in the pooled output of the final text encoder
882-
negative_pooled_prompt_embeds = negative_prompt_embeds[0]
884+
if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
885+
negative_pooled_prompt_embeds = negative_prompt_embeds[0]
883886
negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
884887

885888
negative_prompt_embeds_list.append(negative_prompt_embeds)

examples/community/pipeline_demofusion_sdxl.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,9 @@ def encode_prompt(
290290
)
291291

292292
# We are only ALWAYS interested in the pooled output of the final text encoder
293-
pooled_prompt_embeds = prompt_embeds[0]
293+
if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
294+
pooled_prompt_embeds = prompt_embeds[0]
295+
294296
prompt_embeds = prompt_embeds.hidden_states[-2]
295297

296298
prompt_embeds_list.append(prompt_embeds)
@@ -342,7 +344,8 @@ def encode_prompt(
342344
output_hidden_states=True,
343345
)
344346
# We are only ALWAYS interested in the pooled output of the final text encoder
345-
negative_pooled_prompt_embeds = negative_prompt_embeds[0]
347+
if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
348+
negative_pooled_prompt_embeds = negative_prompt_embeds[0]
346349
negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
347350

348351
negative_prompt_embeds_list.append(negative_prompt_embeds)

examples/community/pipeline_sdxl_style_aligned.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -628,7 +628,9 @@ def encode_prompt(
628628
prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
629629

630630
# We are only ALWAYS interested in the pooled output of the final text encoder
631-
pooled_prompt_embeds = prompt_embeds[0]
631+
if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
632+
pooled_prompt_embeds = prompt_embeds[0]
633+
632634
if clip_skip is None:
633635
prompt_embeds = prompt_embeds.hidden_states[-2]
634636
else:
@@ -688,7 +690,8 @@ def encode_prompt(
688690
output_hidden_states=True,
689691
)
690692
# We are only ALWAYS interested in the pooled output of the final text encoder
691-
negative_pooled_prompt_embeds = negative_prompt_embeds[0]
693+
if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
694+
negative_pooled_prompt_embeds = negative_prompt_embeds[0]
692695
negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
693696

694697
negative_prompt_embeds_list.append(negative_prompt_embeds)

examples/community/pipeline_stable_diffusion_xl_controlnet_adapter.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -359,7 +359,9 @@ def encode_prompt(
359359
prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
360360

361361
# We are only ALWAYS interested in the pooled output of the final text encoder
362-
pooled_prompt_embeds = prompt_embeds[0]
362+
if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
363+
pooled_prompt_embeds = prompt_embeds[0]
364+
363365
if clip_skip is None:
364366
prompt_embeds = prompt_embeds.hidden_states[-2]
365367
else:
@@ -419,7 +421,8 @@ def encode_prompt(
419421
output_hidden_states=True,
420422
)
421423
# We are only ALWAYS interested in the pooled output of the final text encoder
422-
negative_pooled_prompt_embeds = negative_prompt_embeds[0]
424+
if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
425+
negative_pooled_prompt_embeds = negative_prompt_embeds[0]
423426
negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
424427

425428
negative_prompt_embeds_list.append(negative_prompt_embeds)

examples/community/pipeline_stable_diffusion_xl_controlnet_adapter_inpaint.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -507,7 +507,9 @@ def encode_prompt(
507507
prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
508508

509509
# We are only ALWAYS interested in the pooled output of the final text encoder
510-
pooled_prompt_embeds = prompt_embeds[0]
510+
if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
511+
pooled_prompt_embeds = prompt_embeds[0]
512+
511513
if clip_skip is None:
512514
prompt_embeds = prompt_embeds.hidden_states[-2]
513515
else:
@@ -567,7 +569,8 @@ def encode_prompt(
567569
output_hidden_states=True,
568570
)
569571
# We are only ALWAYS interested in the pooled output of the final text encoder
570-
negative_pooled_prompt_embeds = negative_prompt_embeds[0]
572+
if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
573+
negative_pooled_prompt_embeds = negative_prompt_embeds[0]
571574
negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
572575

573576
negative_prompt_embeds_list.append(negative_prompt_embeds)

examples/community/pipeline_stable_diffusion_xl_differential_img2img.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -394,7 +394,9 @@ def encode_prompt(
394394
prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
395395

396396
# We are only ALWAYS interested in the pooled output of the final text encoder
397-
pooled_prompt_embeds = prompt_embeds[0]
397+
if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
398+
pooled_prompt_embeds = prompt_embeds[0]
399+
398400
if clip_skip is None:
399401
prompt_embeds = prompt_embeds.hidden_states[-2]
400402
else:
@@ -454,7 +456,8 @@ def encode_prompt(
454456
output_hidden_states=True,
455457
)
456458
# We are only ALWAYS interested in the pooled output of the final text encoder
457-
negative_pooled_prompt_embeds = negative_prompt_embeds[0]
459+
if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
460+
negative_pooled_prompt_embeds = negative_prompt_embeds[0]
458461
negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
459462

460463
negative_prompt_embeds_list.append(negative_prompt_embeds)

examples/community/pipeline_stable_diffusion_xl_ipex.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,9 @@ def encode_prompt(
390390
prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)
391391

392392
# We are only ALWAYS interested in the pooled output of the final text encoder
393-
pooled_prompt_embeds = prompt_embeds[0]
393+
if pooled_prompt_embeds is None and prompt_embeds[0].ndim == 2:
394+
pooled_prompt_embeds = prompt_embeds[0]
395+
394396
if clip_skip is None:
395397
prompt_embeds = prompt_embeds.hidden_states[-2]
396398
else:
@@ -450,7 +452,8 @@ def encode_prompt(
450452
output_hidden_states=True,
451453
)
452454
# We are only ALWAYS interested in the pooled output of the final text encoder
453-
negative_pooled_prompt_embeds = negative_prompt_embeds[0]
455+
if negative_pooled_prompt_embeds is None and negative_prompt_embeds[0].ndim == 2:
456+
negative_pooled_prompt_embeds = negative_prompt_embeds[0]
454457
negative_prompt_embeds = negative_prompt_embeds.hidden_states[-2]
455458

456459
negative_prompt_embeds_list.append(negative_prompt_embeds)

scripts/convert_sana_to_diffusers.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
CTX = init_empty_weights if is_accelerate_available else nullcontext
2626

2727
ckpt_ids = [
28+
"Efficient-Large-Model/Sana_1600M_4Kpx_BF16/checkpoints/Sana_1600M_4Kpx_BF16.pth",
2829
"Efficient-Large-Model/Sana_1600M_2Kpx_BF16/checkpoints/Sana_1600M_2Kpx_BF16.pth",
2930
"Efficient-Large-Model/Sana_1600M_1024px_MultiLing/checkpoints/Sana_1600M_1024px_MultiLing.pth",
3031
"Efficient-Large-Model/Sana_1600M_1024px_BF16/checkpoints/Sana_1600M_1024px_BF16.pth",
@@ -89,7 +90,10 @@ def main(args):
8990
converted_state_dict["caption_norm.weight"] = state_dict.pop("attention_y_norm.weight")
9091

9192
# scheduler
92-
flow_shift = 3.0
93+
if args.image_size == 4096:
94+
flow_shift = 6.0
95+
else:
96+
flow_shift = 3.0
9397

9498
# model config
9599
if args.model_type == "SanaMS_1600M_P1_D20":
@@ -99,7 +103,7 @@ def main(args):
99103
else:
100104
raise ValueError(f"{args.model_type} is not supported.")
101105
# Positional embedding interpolation scale.
102-
interpolation_scale = {512: None, 1024: None, 2048: 1.0}
106+
interpolation_scale = {512: None, 1024: None, 2048: 1.0, 4096: 2.0}
103107

104108
for depth in range(layer_num):
105109
# Transformer blocks.
@@ -272,9 +276,9 @@ def main(args):
272276
"--image_size",
273277
default=1024,
274278
type=int,
275-
choices=[512, 1024, 2048],
279+
choices=[512, 1024, 2048, 4096],
276280
required=False,
277-
help="Image size of pretrained model, 512, 1024 or 2048.",
281+
help="Image size of pretrained model, 512, 1024, 2048 or 4096.",
278282
)
279283
parser.add_argument(
280284
"--model_type", default="SanaMS_1600M_P1_D20", type=str, choices=["SanaMS_1600M_P1_D20", "SanaMS_600M_P1_D28"]

0 commit comments

Comments (0)