Skip to content

Commit 829545d

Browse files
committed
rename text2image pipeline
1 parent 9059a52 commit 829545d

File tree

6 files changed: 21 additions and 13 deletions.

scripts/convert_cosmos_to_diffusers.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,11 @@
1010
from diffusers import (
1111
AutoencoderKLCosmos,
1212
AutoencoderKLWan,
13-
CosmosTextToImagePipeline,
13+
Cosmos2TextToImagePipeline,
14+
Cosmos2VideoToWorldPipeline,
1415
CosmosTextToWorldPipeline,
1516
CosmosTransformer3DModel,
17+
CosmosVideoToWorldPipeline,
1618
EDMEulerScheduler,
1719
)
1820

@@ -412,7 +414,8 @@ def save_pipeline_cosmos_1_0(args, transformer, vae):
412414
final_sigmas_type="sigma_min",
413415
)
414416

415-
pipe = CosmosTextToWorldPipeline(
417+
pipe_cls = CosmosTextToWorldPipeline if "Text2World" in args.transformer_type else CosmosVideoToWorldPipeline
418+
pipe = pipe_cls(
416419
text_encoder=text_encoder,
417420
tokenizer=tokenizer,
418421
transformer=transformer,
@@ -438,7 +441,8 @@ def save_pipeline_cosmos_2_0(args, transformer, vae):
438441
use_flow_sigmas=True,
439442
)
440443

441-
pipe = CosmosTextToImagePipeline(
444+
pipe_cls = Cosmos2TextToImagePipeline if "Text2Image" in args.transformer_type else Cosmos2VideoToWorldPipeline
445+
pipe = pipe_cls(
442446
text_encoder=text_encoder,
443447
tokenizer=tokenizer,
444448
transformer=transformer,

src/diffusers/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -361,8 +361,8 @@
361361
"CogView4ControlPipeline",
362362
"CogView4Pipeline",
363363
"ConsisIDPipeline",
364+
"Cosmos2TextToImagePipeline",
364365
"Cosmos2VideoToWorldPipeline",
365-
"CosmosTextToImagePipeline",
366366
"CosmosTextToWorldPipeline",
367367
"CosmosVideoToWorldPipeline",
368368
"CycleDiffusionPipeline",
@@ -951,8 +951,8 @@
951951
CogView4ControlPipeline,
952952
CogView4Pipeline,
953953
ConsisIDPipeline,
954+
Cosmos2TextToImagePipeline,
954955
Cosmos2VideoToWorldPipeline,
955-
CosmosTextToImagePipeline,
956956
CosmosTextToWorldPipeline,
957957
CosmosVideoToWorldPipeline,
958958
CycleDiffusionPipeline,

src/diffusers/pipelines/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@
158158
_import_structure["cogview4"] = ["CogView4Pipeline", "CogView4ControlPipeline"]
159159
_import_structure["consisid"] = ["ConsisIDPipeline"]
160160
_import_structure["cosmos"] = [
161-
"CosmosTextToImagePipeline",
161+
"Cosmos2TextToImagePipeline",
162162
"CosmosTextToWorldPipeline",
163163
"CosmosVideoToWorldPipeline",
164164
"Cosmos2VideoToWorldPipeline",
@@ -565,8 +565,8 @@
565565
StableDiffusionXLControlNetXSPipeline,
566566
)
567567
from .cosmos import (
568+
Cosmos2TextToImagePipeline,
568569
Cosmos2VideoToWorldPipeline,
569-
CosmosTextToImagePipeline,
570570
CosmosTextToWorldPipeline,
571571
CosmosVideoToWorldPipeline,
572572
)

src/diffusers/pipelines/cosmos/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@
2222

2323
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
2424
else:
25+
_import_structure["pipeline_cosmos2_text2image"] = ["Cosmos2TextToImagePipeline"]
2526
_import_structure["pipeline_cosmos2_video2world"] = ["Cosmos2VideoToWorldPipeline"]
26-
_import_structure["pipeline_cosmos_text2image"] = ["CosmosTextToImagePipeline"]
2727
_import_structure["pipeline_cosmos_text2world"] = ["CosmosTextToWorldPipeline"]
2828
_import_structure["pipeline_cosmos_video2world"] = ["CosmosVideoToWorldPipeline"]
2929

@@ -35,8 +35,8 @@
3535
except OptionalDependencyNotAvailable:
3636
from ...utils.dummy_torch_and_transformers_objects import *
3737
else:
38+
from .pipeline_cosmos2_text2image import Cosmos2TextToImagePipeline
3839
from .pipeline_cosmos2_video2world import Cosmos2VideoToWorldPipeline
39-
from .pipeline_cosmos_text2image import CosmosTextToImagePipeline
4040
from .pipeline_cosmos_text2world import CosmosTextToWorldPipeline
4141
from .pipeline_cosmos_video2world import CosmosVideoToWorldPipeline
4242

src/diffusers/pipelines/cosmos/pipeline_cosmos_text2image.py renamed to src/diffusers/pipelines/cosmos/pipeline_cosmos2_text2image.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,11 +54,11 @@ def __init__(self, *args, **kwargs):
5454
Examples:
5555
```python
5656
>>> import torch
57-
>>> from diffusers import CosmosTextToImagePipeline
57+
>>> from diffusers import Cosmos2TextToImagePipeline
5858
5959
>>> # Available checkpoints: nvidia/Cosmos-Predict2-2B-Text2Image, nvidia/Cosmos-Predict2-14B-Text2Image
6060
>>> model_id = "nvidia/Cosmos-Predict2-2B-Text2Image"
61-
>>> pipe = CosmosTextToImagePipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
61+
>>> pipe = Cosmos2TextToImagePipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
6262
>>> pipe.to("cuda")
6363
6464
>>> prompt = "A close-up shot captures a vibrant yellow scrubber vigorously working on a grimy plate, its bristles moving in circular motions to lift stubborn grease and food residue. The dish, once covered in remnants of a hearty meal, gradually reveals its original glossy surface. Suds form and bubble around the scrubber, creating a satisfying visual of cleanliness in progress. The sound of scrubbing fills the air, accompanied by the gentle clinking of the dish against the sink. As the scrubber continues its task, the dish transforms, gleaming under the bright kitchen lights, symbolizing the triumph of cleanliness over mess."
@@ -132,7 +132,7 @@ def retrieve_timesteps(
132132
return timesteps, num_inference_steps
133133

134134

135-
class CosmosTextToImagePipeline(DiffusionPipeline):
135+
class Cosmos2TextToImagePipeline(DiffusionPipeline):
136136
r"""
137137
Pipeline for text-to-image generation using [Cosmos](https://github.com/NVIDIA/Cosmos).
138138
@@ -637,6 +637,10 @@ def __call__(
637637
else:
638638
video = self.video_processor.postprocess_video(video, output_type=output_type)
639639
image = [batch[0] for batch in video]
640+
if isinstance(video, torch.Tensor):
641+
image = torch.stack(image)
642+
elif isinstance(video, np.ndarray):
643+
image = np.stack(image)
640644
else:
641645
image = latents[:, :, 0]
642646

src/diffusers/utils/dummy_torch_and_transformers_objects.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,7 @@ def from_pretrained(cls, *args, **kwargs):
407407
requires_backends(cls, ["torch", "transformers"])
408408

409409

410-
class CosmosTextToImagePipeline(metaclass=DummyObject):
410+
class Cosmos2TextToImagePipeline(metaclass=DummyObject):
411411
_backends = ["torch", "transformers"]
412412

413413
def __init__(self, *args, **kwargs):

Comments: 0 commit comments.