Skip to content

Commit 93af686

Browse files
committed
Pipeline: Enhance inference pipelines with new features
* Adaptive normalization after latent upsampling
* CFG Star Rescale
* Varying STG/CFG parameters per step
* Support skipping the initial and/or the final diffusion steps
* CRF compression for image condition (useful for getting more motion in image-to-video)
1 parent cb6f842 commit 93af686

File tree

10 files changed

+220
-43
lines changed

10 files changed

+220
-43
lines changed

configs/ltxv-13b-0.9.7-dev.yaml

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
21
pipeline_type: multi-scale
32
checkpoint_path: "ltxv-13b-0.9.7-dev.safetensors"
43
downscale_factor: 0.6666666
@@ -14,20 +13,22 @@ prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-P
1413
prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
1514
stochastic_sampling: false
1615

17-
1816
first_pass:
19-
guidance_scale: [3]
20-
stg_scale: [1]
21-
rescaling_scale: [0.7]
22-
guidance_timesteps: [1.0]
23-
skip_block_list: [19] # [[1], [1,2], [1,2,3], [27], [28], [28]]
17+
guidance_scale: [1, 1, 6, 8, 6, 1, 1]
18+
stg_scale: [0, 0, 4, 4, 4, 2, 1]
19+
rescaling_scale: [1, 1, 0.5, 0.5, 1, 1, 1]
20+
guidance_timesteps: [1.0, 0.996, 0.9933, 0.9850, 0.9767, 0.9008, 0.6180]
21+
skip_block_list: [[], [11, 25, 35, 39], [22, 35, 39], [28], [28], [28], [28]]
2422
num_inference_steps: 30
23+
skip_final_inference_steps: 3
24+
cfg_star_rescale: true
2525

2626
second_pass:
27-
guidance_scale: [3]
27+
guidance_scale: [1]
2828
stg_scale: [1]
29-
rescaling_scale: [0.7]
29+
rescaling_scale: [1]
3030
guidance_timesteps: [1.0]
31-
skip_block_list: [19] # [[1], [1,2], [1,2,3], [27], [28], [28]]
32-
num_inference_steps: 10
33-
strength: 0.85
31+
skip_block_list: [27]
32+
num_inference_steps: 30
33+
skip_initial_inference_steps: 17
34+
cfg_star_rescale: true

configs/ltxv-2b-0.9.1.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
pipeline_type: base
2+
checkpoint_path: "ltx-video-2b-v0.9.1.safetensors"
3+
guidance_scale: 3
4+
stg_scale: 1
5+
rescaling_scale: 0.7
6+
skip_block_list: [19]
7+
num_inference_steps: 40
8+
stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
9+
decode_timestep: 0.05
10+
decode_noise_scale: 0.025
11+
text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
12+
precision: "bfloat16"
13+
sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
14+
prompt_enhancement_words_threshold: 120
15+
prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
16+
prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
17+
stochastic_sampling: false

configs/ltxv-2b-0.9.5.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
pipeline_type: base
2+
checkpoint_path: "ltx-video-2b-v0.9.5.safetensors"
3+
guidance_scale: 3
4+
stg_scale: 1
5+
rescaling_scale: 0.7
6+
skip_block_list: [19]
7+
num_inference_steps: 40
8+
stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
9+
decode_timestep: 0.05
10+
decode_noise_scale: 0.025
11+
text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
12+
precision: "bfloat16"
13+
sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
14+
prompt_enhancement_words_threshold: 120
15+
prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
16+
prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
17+
stochastic_sampling: false

configs/ltxv-2b-0.9.6-distilled.yaml

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
11
pipeline_type: base
22
checkpoint_path: "ltxv-2b-0.9.6-distilled-04-25.safetensors"
3-
guidance_scale: 3
4-
stg_scale: 1
5-
rescaling_scale: 0.7
6-
skip_block_list: [19]
3+
guidance_scale: 1
4+
stg_scale: 0
5+
rescaling_scale: 1
76
num_inference_steps: 8
87
stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
98
decode_timestep: 0.05

configs/ltxv-2b-0.9.yaml

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
pipeline_type: base
2+
checkpoint_path: "ltx-video-2b-v0.9.safetensors"
3+
guidance_scale: 3
4+
stg_scale: 1
5+
rescaling_scale: 0.7
6+
skip_block_list: [19]
7+
num_inference_steps: 40
8+
stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block"
9+
decode_timestep: 0.05
10+
decode_noise_scale: 0.025
11+
text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS"
12+
precision: "bfloat16"
13+
sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint"
14+
prompt_enhancement_words_threshold: 120
15+
prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0"
16+
prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct"
17+
stochastic_sampling: false

inference.py

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import json
1212
import numpy as np
1313
import torch
14+
import cv2
1415
from safetensors import safe_open
1516
from PIL import Image
1617
from transformers import (
@@ -35,6 +36,7 @@
3536
from ltx_video.schedulers.rf import RectifiedFlowScheduler
3637
from ltx_video.utils.skip_layer_strategy import SkipLayerStrategy
3738
from ltx_video.models.autoencoders.latent_upsampler import LatentUpsampler
39+
import ltx_video.pipelines.crf_compressor as crf_compressor
3840

3941
MAX_HEIGHT = 720
4042
MAX_WIDTH = 1280
@@ -96,7 +98,12 @@ def load_image_to_tensor_with_resize_and_crop(
9698
image = image.crop((x_start, y_start, x_start + new_width, y_start + new_height))
9799
if not just_crop:
98100
image = image.resize((target_width, target_height))
99-
frame_tensor = torch.tensor(np.array(image)).permute(2, 0, 1).float()
101+
102+
image = np.array(image)
103+
image = cv2.GaussianBlur(image, (3, 3), 0)
104+
frame_tensor = torch.from_numpy(image).float()
105+
frame_tensor = crf_compressor.compress(frame_tensor / 255.0) * 255.0
106+
frame_tensor = frame_tensor.permute(2, 0, 1)
100107
frame_tensor = (frame_tensor / 127.5) - 1.0
101108
# Create 5D tensor: (batch_size=1, channels=3, num_frames=1, height, width)
102109
return frame_tensor.unsqueeze(0).unsqueeze(2)
@@ -266,13 +273,6 @@ def main():
266273
help="Path to the input video (or imaage) to be modified using the video-to-video pipeline",
267274
)
268275

269-
parser.add_argument(
270-
"--strength",
271-
type=float,
272-
default=1.0,
273-
help="Editing strength (noising level) for video-to-video pipeline.",
274-
)
275-
276276
# Conditioning arguments
277277
parser.add_argument(
278278
"--conditioning_media_paths",
@@ -407,7 +407,6 @@ def infer(
407407
negative_prompt: str,
408408
offload_to_cpu: bool,
409409
input_media_path: Optional[str] = None,
410-
strength: Optional[float] = 1.0,
411410
conditioning_media_paths: Optional[List[str]] = None,
412411
conditioning_strengths: Optional[List[float]] = None,
413412
conditioning_start_frames: Optional[List[int]] = None,
@@ -614,7 +613,6 @@ def infer(
614613
frame_rate=frame_rate,
615614
**sample,
616615
media_items=media_item,
617-
strength=strength,
618616
conditioning_items=conditioning_items,
619617
is_video=True,
620618
vae_per_channel_normalize=True,
Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import av
2+
import torch
3+
import io
4+
import numpy as np
5+
6+
7+
def _encode_single_frame(output_file, image_array: np.ndarray, crf):
    """Encode one RGB frame into *output_file* as a single-frame H.264/MP4.

    *crf* is the x264 constant-rate-factor controlling how lossy the
    encoding is (higher = more compression artifacts).
    """
    out = av.open(output_file, "w", format="mp4")
    try:
        video_stream = out.add_stream(
            "libx264", rate=1, options={"crf": str(crf), "preset": "veryfast"}
        )
        frame_height, frame_width = image_array.shape[0], image_array.shape[1]
        video_stream.height = frame_height
        video_stream.width = frame_width
        # x264 consumes yuv420p, so convert the packed-RGB frame first.
        yuv_frame = av.VideoFrame.from_ndarray(image_array, format="rgb24").reformat(
            format="yuv420p"
        )
        # Encode the frame, then flush the encoder's buffered packets.
        out.mux(video_stream.encode(yuv_frame))
        out.mux(video_stream.encode())
    finally:
        out.close()
22+
23+
24+
def _decode_single_frame(video_file):
    """Decode the first video frame of *video_file* and return it as an RGB ndarray."""
    demuxer = av.open(video_file)
    try:
        # Grab the first video stream and pull exactly one decoded frame.
        video_stream = next(s for s in demuxer.streams if s.type == "video")
        first_frame = next(demuxer.decode(video_stream))
    finally:
        demuxer.close()
    return first_frame.to_ndarray(format="rgb24")
32+
33+
34+
def compress(image: torch.Tensor, crf=29):
    """Round-trip *image* through a single-frame H.264 encode/decode.

    This simulates video-compression artifacts on a still image.  *image* is
    a float HxWx3 tensor (values presumably in [0, 1] — scaled by 255 before
    encoding); the result has the same dtype and device.  ``crf == 0``
    disables compression and returns the input tensor unchanged.
    """
    if crf == 0:
        return image

    # yuv420p needs even spatial dimensions, so trim to even height/width,
    # then hand the encoder a uint8 array on the CPU.
    even_h = (image.shape[0] // 2) * 2
    even_w = (image.shape[1] // 2) * 2
    image_array = (image[:even_h, :even_w] * 255.0).byte().cpu().numpy()

    with io.BytesIO() as output_file:
        _encode_single_frame(output_file, image_array, crf)
        video_bytes = output_file.getvalue()

    with io.BytesIO(video_bytes) as video_file:
        image_array = _decode_single_frame(video_file)

    return torch.tensor(image_array, dtype=image.dtype, device=image.device) / 255.0

0 commit comments

Comments
 (0)