
Commit 6dc9bdb

final changes
1 parent 33056c5 commit 6dc9bdb

2 files changed: +7 -19 lines changed

docs/source/en/api/pipelines/cogvideox.md

Lines changed: 5 additions & 18 deletions
@@ -48,9 +48,11 @@ from diffusers import CogVideoXPipeline, CogVideoXImageToVideoPipeline
 from diffusers.utils import export_to_video,load_image
 pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b").to("cuda") # or "THUDM/CogVideoX-2b"
 ```
+
 If you are using the image-to-video pipeline, load it as follows:
+
 ```python
-pipe = CogVideoXImageToVideoPipeline.from_pretrained("THUDM/CogVideoX-5b-I2V").to("cuda") # Image-to-Video pipeline
+pipe = CogVideoXImageToVideoPipeline.from_pretrained("THUDM/CogVideoX-5b-I2V").to("cuda")
 ```

 Then change the memory layout of the pipelines `transformer` component to `torch.channels_last`:
@@ -59,7 +61,7 @@ Then change the memory layout of the pipelines `transformer` component to `torch
 pipe.transformer.to(memory_format=torch.channels_last)
 ```

-compile the components and run inference:
+Compile the components and run inference:

 ```python
 pipe.transformer = torch.compile(pipeline.transformer, mode="max-autotune", fullgraph=True)
@@ -69,22 +71,7 @@ prompt = "A panda, dressed in a small, red jacket and a tiny hat, sits on a wood
 video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
 ```

-if you are using the image-to-video pipeline, you can use the following code to generate a video from an image:
-
-```python
-image = load_image("image_of_panda.jpg")
-prompt = "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical performance."
-video = pipe(prompt=prompt, image=image, guidance_scale=6, num_inference_steps=50).frames[0]
-```
-
-To save the video, use the following code:
-
-```python
-export_to_video(video, "panda_video.mp4")
-```
-
-
-The [benchmark](https://gist.github.com/a-r-r-o-w/5183d75e452a368fd17448fcc810bd3f) results on an 80GB A100 machine are:
+The [T2V benchmark](https://gist.github.com/a-r-r-o-w/5183d75e452a368fd17448fcc810bd3f) results on an 80GB A100 machine are:

 ```
 Without torch.compile(): Average inference time: 96.89 seconds.
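
For reference, the text-to-video flow documented in this file reads end to end roughly as below (a minimal sketch assembled from the doc's own snippets, not part of the commit). Note that the doc passes `pipeline.transformer` to `torch.compile` although the object is created as `pipe`; the sketch uses `pipe` throughout.

```python
# Minimal sketch of the documented CogVideoX text-to-video + torch.compile flow
# (assembled from the doc's snippets; not part of this commit).
import torch
from diffusers import CogVideoXPipeline
from diffusers.utils import export_to_video

pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b").to("cuda")  # or "THUDM/CogVideoX-2b"

# Switch the transformer to channels_last and compile it, as the doc describes.
pipe.transformer.to(memory_format=torch.channels_last)
pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)

prompt = "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest."
video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
export_to_video(video, "panda_video.mp4")
```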

scripts/convert_cogvideox_to_diffusers.py

Lines changed: 2 additions & 1 deletion
@@ -241,9 +241,10 @@ def get_args():
     if args.vae_ckpt_path is not None:
         vae = convert_vae(args.vae_ckpt_path, args.scaling_factor, dtype)

-    text_encoder_id = "/share/official_pretrains/hf_home//t5-v1_1-xxl"
+    text_encoder_id = "google/t5-v1_1-xxl"
     tokenizer = T5Tokenizer.from_pretrained(text_encoder_id, model_max_length=TOKENIZER_MAX_LENGTH)
     text_encoder = T5EncoderModel.from_pretrained(text_encoder_id, cache_dir=args.text_encoder_cache_dir)
+
     # Apparently, the conversion does not work anymore without this :shrug:
     for param in text_encoder.parameters():
         param.data = param.data.contiguous()
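
As context for the `.contiguous()` loop above (an illustrative aside, not part of the commit): parameters that are non-contiguous views can trip up serialization, e.g. safetensors refuses to save non-contiguous tensors, which is presumably why the conversion script forces a contiguous copy of every text-encoder parameter.

```python
import torch

# Illustrative only: a transposed weight is a non-contiguous view of its storage.
w = torch.randn(4, 8).t()
print(w.is_contiguous())  # False

# Forcing a contiguous copy, as the conversion script does for each parameter.
w = w.contiguous()
print(w.is_contiguous())  # True
```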
