@@ -144,13 +144,11 @@ class CogView4ControlPipeline(DiffusionPipeline):
     Args:
         vae ([`AutoencoderKL`]):
             Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`T5EncoderModel`]):
-            Frozen text-encoder. CogView4 uses
-            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
-            [t5-v1_1-xxl](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl) variant.
-        tokenizer (`T5Tokenizer`):
+        text_encoder ([`GlmModel`]):
+            Frozen text-encoder. CogView4 uses [glm-4-9b-hf](https://huggingface.co/THUDM/glm-4-9b-hf).
+        tokenizer (`PreTrainedTokenizer`):
             Tokenizer of class
-            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
+            [PreTrainedTokenizer](https://huggingface.co/docs/transformers/main/en/main_classes/tokenizer#transformers.PreTrainedTokenizer).
         transformer ([`CogView4Transformer2DModel`]):
             A text conditioned `CogView4Transformer2DModel` to denoise the encoded image latents.
         scheduler ([`SchedulerMixin`]):
@@ -182,7 +180,6 @@ def _get_glm_embeds(
         prompt: Union[str, List[str]] = None,
         num_images_per_prompt: int = 1,
         max_sequence_length: int = 1024,
-        padding_type: str = "longest",
         device: Optional[torch.device] = None,
         dtype: Optional[torch.dtype] = None,
     ):
@@ -194,7 +191,7 @@ def _get_glm_embeds(

         text_inputs = self.tokenizer(
             prompt,
-            padding=padding_type,
+            padding="longest",  # pad to the longest prompt in the batch, not to max_length
             max_length=max_sequence_length,
             truncation=True,
             add_special_tokens=True,
@@ -240,7 +237,6 @@ def encode_prompt(
         device: Optional[torch.device] = None,
         dtype: Optional[torch.dtype] = None,
         max_sequence_length: int = 1024,
-        padding_type: str = "longest",
     ):
         r"""
         Encodes the prompt into text encoder hidden states.
@@ -278,7 +274,7 @@ def encode_prompt(
         else:
             batch_size = prompt_embeds.shape[0]
         if prompt_embeds is None:
-            prompt_embeds = self._get_glm_embeds(prompt, num_images_per_prompt, max_sequence_length, padding_type, device, dtype)
+            prompt_embeds = self._get_glm_embeds(prompt, num_images_per_prompt, max_sequence_length, device, dtype)

         if do_classifier_free_guidance and negative_prompt_embeds is None:
             negative_prompt = negative_prompt or ""
@@ -297,7 +293,7 @@ def encode_prompt(
                 )

             negative_prompt_embeds = self._get_glm_embeds(
-                negative_prompt, num_images_per_prompt, max_sequence_length, "longest", device, dtype
+                negative_prompt, num_images_per_prompt, max_sequence_length, device, dtype
             )

         return prompt_embeds, negative_prompt_embeds
@@ -451,7 +447,6 @@ def __call__(
         ] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         max_sequence_length: int = 1024,
-        padding_type: str = "longest",  # For downstream tasks, it can be modified to use max_length for implementation.
     ) -> Union[CogView4PipelineOutput, Tuple]:
         """
         Function invoked when calling the pipeline for generation.
@@ -581,8 +576,7 @@ def __call__(
             prompt_embeds=prompt_embeds,
             negative_prompt_embeds=negative_prompt_embeds,
             max_sequence_length=max_sequence_length,
-            padding_type=padding_type,
-            device=device
+            device=device,
         )

         # Prepare latents
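For reviewers, here is a minimal sketch of the tokenization behaviour this change hard-codes: with `padding="longest"` the GLM tokenizer pads each batch only to its longest prompt (still truncated at `max_sequence_length`), instead of always padding to `max_sequence_length` as a caller could previously request via `padding_type`. The standalone snippet and the `THUDM/glm-4-9b-hf` checkpoint id are illustrative assumptions; the pipeline itself calls `self.tokenizer` internally.

```python
# Illustrative sketch only; mirrors the hard-coded tokenizer call in `_get_glm_embeds`.
# Assumes the THUDM/glm-4-9b-hf tokenizer is available locally or via the Hub.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-hf")

prompts = ["a cat", "a highly detailed photo of a red fox standing in fresh snow"]

text_inputs = tokenizer(
    prompts,
    padding="longest",        # pad only to the longest prompt in this batch
    max_length=1024,          # matches the pipeline's max_sequence_length default
    truncation=True,
    add_special_tokens=True,
    return_tensors="pt",
)

# Sequence length equals the longest prompt in the batch, not 1024.
print(text_inputs.input_ids.shape)
```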
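From the user's side, nothing needs to change except that `padding_type` is no longer accepted by `__call__` or `encode_prompt`. A hedged usage sketch, assuming a placeholder checkpoint id and a `control_image` argument name (neither is confirmed by this diff):

```python
# Sketch under assumptions: the repo id and `control_image` argument name are placeholders,
# not confirmed by this PR.
import torch
from diffusers import CogView4ControlPipeline
from diffusers.utils import load_image

pipe = CogView4ControlPipeline.from_pretrained(
    "THUDM/CogView4-6B-Control",  # placeholder checkpoint id
    torch_dtype=torch.bfloat16,
).to("cuda")

control_image = load_image("control.png")  # placeholder conditioning image

# `padding_type` is gone; prompts are padded with "longest" internally.
image = pipe(
    prompt="a stone bridge over a river at sunset, detailed reflections",
    control_image=control_image,
    num_inference_steps=50,
    max_sequence_length=1024,
).images[0]
image.save("cogview4_control.png")
```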