Commit 41ec606

Merge branch 'main' into introduce_autopipeline_for_text2video

2 parents e53b98d + 671149e

29 files changed: +1035 −120 lines

docs/source/en/_toctree.yml

Lines changed: 6 additions & 2 deletions

```diff
@@ -401,6 +401,8 @@
         title: WanAnimateTransformer3DModel
       - local: api/models/wan_transformer_3d
         title: WanTransformer3DModel
+      - local: api/models/z_image_transformer2d
+        title: ZImageTransformer2DModel
       title: Transformers
   - sections:
       - local: api/models/stable_cascade_unet
@@ -551,6 +553,8 @@
         title: Kandinsky 2.2
       - local: api/pipelines/kandinsky3
         title: Kandinsky 3
+      - local: api/pipelines/kandinsky5_image
+        title: Kandinsky 5.0 Image
       - local: api/pipelines/kolors
         title: Kolors
       - local: api/pipelines/latent_consistency_models
@@ -646,6 +650,8 @@
         title: VisualCloze
       - local: api/pipelines/wuerstchen
         title: Wuerstchen
+      - local: api/pipelines/z_image
+        title: Z-Image
       title: Image
   - sections:
       - local: api/pipelines/allegro
@@ -664,8 +670,6 @@
         title: HunyuanVideo1.5
       - local: api/pipelines/i2vgenxl
         title: I2VGen-XL
-      - local: api/pipelines/kandinsky5_image
-        title: Kandinsky 5.0 Image
       - local: api/pipelines/kandinsky5_video
         title: Kandinsky 5.0 Video
       - local: api/pipelines/latte
```

docs/source/en/api/cache.md

Lines changed: 6 additions & 0 deletions

```diff
@@ -34,3 +34,9 @@ Cache methods speedup diffusion transformers by storing and reusing intermediate
 [[autodoc]] FirstBlockCacheConfig

 [[autodoc]] apply_first_block_cache
+
+### TaylorSeerCacheConfig
+
+[[autodoc]] TaylorSeerCacheConfig
+
+[[autodoc]] apply_taylorseer_cache
```
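As a quick illustration of the functional entry point added here (not part of the commit): a minimal sketch, assuming `apply_taylorseer_cache` follows the `(module, config)` signature of the existing `apply_*` cache helpers such as `apply_first_block_cache`.

```python
import torch
from diffusers import FluxTransformer2DModel, TaylorSeerCacheConfig, apply_taylorseer_cache

# Load a transformer that supports caching; FLUX.1-dev stores it in the
# "transformer" subfolder of the pipeline repository.
transformer = FluxTransformer2DModel.from_pretrained(
    "black-forest-labs/FLUX.1-dev", subfolder="transformer", torch_dtype=torch.bfloat16
)

# Assumed signature (module, config), by analogy with apply_first_block_cache.
config = TaylorSeerCacheConfig(cache_interval=5, max_order=1, disable_cache_before_step=10)
apply_taylorseer_cache(transformer, config)
```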
docs/source/en/api/models/z_image_transformer2d.md

Lines changed: 19 additions & 0 deletions

```diff
@@ -0,0 +1,19 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# ZImageTransformer2DModel
+
+A Transformer model for image-like data from [Z-Image](https://huggingface.co/Tongyi-MAI/Z-Image-Turbo).
+
+## ZImageTransformer2DModel
+
+[[autodoc]] ZImageTransformer2DModel
```
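For context, a minimal sketch (not part of the diff) of loading the new model class on its own; the `subfolder="transformer"` layout is an assumption based on the usual diffusers pipeline repository structure.

```python
import torch
from diffusers import ZImageTransformer2DModel

# Load just the transformer from the Z-Image-Turbo checkpoint; the
# "transformer" subfolder name is assumed from standard pipeline layout.
transformer = ZImageTransformer2DModel.from_pretrained(
    "Tongyi-MAI/Z-Image-Turbo",
    subfolder="transformer",
    torch_dtype=torch.bfloat16,
)
print(sum(p.numel() for p in transformer.parameters()))  # roughly 6B parameters
```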

docs/source/en/api/pipelines/kandinsky5_image.md

Lines changed: 5 additions & 1 deletion

```diff
@@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License.

 [Kandinsky 5.0](https://arxiv.org/abs/2511.14993) is a family of diffusion models for Video & Image generation.

-Kandinsky 5.0 Image Lite is a lightweight image generation model (6B parameters)
+Kandinsky 5.0 Image Lite is a lightweight image generation model (6B parameters).

 The model introduces several key innovations:
 - **Latent diffusion pipeline** with **Flow Matching** for improved training stability
@@ -21,10 +21,14 @@ The model introduces several key innovations:

 The original codebase can be found at [kandinskylab/Kandinsky-5](https://github.com/kandinskylab/Kandinsky-5).

+> [!TIP]
+> Check out the [Kandinsky Lab](https://huggingface.co/kandinskylab) organization on the Hub for the official model checkpoints for text-to-video generation, including pretrained, SFT, no-CFG, and distilled variants.
+

 ## Available Models

 Kandinsky 5.0 Image Lite:
+
 | model_id | Description | Use Cases |
 |------------|-------------|-----------|
 | [**kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers**](https://huggingface.co/kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers) | 6B image Supervised Fine-Tuned model | Highest generation quality |
```
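A minimal usage sketch for the checkpoint listed above (not part of the diff), assuming it resolves to its pipeline class through the generic `DiffusionPipeline.from_pretrained` entry point and follows the standard prompt-to-images interface.

```python
import torch
from diffusers import DiffusionPipeline

# The checkpoint id comes from the table above; everything else is the
# standard diffusers text-to-image calling convention, assumed here.
pipe = DiffusionPipeline.from_pretrained(
    "kandinskylab/Kandinsky-5.0-T2I-Lite-sft-Diffusers",
    torch_dtype=torch.bfloat16,
)
pipe.to("cuda")

image = pipe(prompt="A watercolor fox in a snowy forest").images[0]
image.save("kandinsky5_image.png")
```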

docs/source/en/api/pipelines/kandinsky5_video.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -30,6 +30,7 @@ The original codebase can be found at [kandinskylab/Kandinsky-5](https://github.com/kandinskylab/Kandinsky-5).
 ## Available Models

 Kandinsky 5.0 T2V Pro:
+
 | model_id | Description | Use Cases |
 |------------|-------------|-----------|
 | **kandinskylab/Kandinsky-5.0-T2V-Pro-sft-5s-Diffusers** | 5 second Text-to-Video Pro model | High-quality text-to-video generation |
```
docs/source/en/api/pipelines/z_image.md

Lines changed: 33 additions & 0 deletions

```diff
@@ -0,0 +1,33 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Z-Image
+
+<div class="flex flex-wrap space-x-1">
+  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
+</div>
+
+[Z-Image](https://huggingface.co/papers/2511.22699) is a powerful and highly efficient image generation model with 6B parameters. Currently only one model is available, with two more to be released:
+
+|Model|Hugging Face|
+|---|---|
+|Z-Image-Turbo|https://huggingface.co/Tongyi-MAI/Z-Image-Turbo|
+
+## Z-Image-Turbo
+
+Z-Image-Turbo is a distilled version of Z-Image that matches or exceeds leading competitors with only 8 NFEs (Number of Function Evaluations). It offers sub-second inference latency on enterprise-grade H800 GPUs and fits comfortably within 16GB of VRAM on consumer devices. It excels in photorealistic image generation, bilingual text rendering (English & Chinese), and robust instruction adherence.
+
+## ZImagePipeline
+
+[[autodoc]] ZImagePipeline
+  - all
+  - __call__
```
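A minimal sketch of running the pipeline documented above, assuming `ZImagePipeline` follows the standard diffusers text-to-image call signature; `num_inference_steps=8` mirrors the 8 NFEs mentioned in the doc.

```python
import torch
from diffusers import ZImagePipeline

pipe = ZImagePipeline.from_pretrained(
    "Tongyi-MAI/Z-Image-Turbo",
    torch_dtype=torch.bfloat16,
)
pipe.to("cuda")

# 8 steps to match the 8 NFEs described above; argument names are assumed
# from the standard diffusers text-to-image interface.
image = pipe(
    prompt="A neon sign reading 'Z-Image' on a rainy street, photorealistic",
    num_inference_steps=8,
).images[0]
image.save("z_image_turbo.png")
```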

docs/source/en/optimization/cache.md

Lines changed: 31 additions & 0 deletions

````diff
@@ -66,4 +66,35 @@ config = FasterCacheConfig(
     tensor_format="BFCHW",
 )
 pipeline.transformer.enable_cache(config)
+```
+
+## TaylorSeer Cache
+
+[TaylorSeer Cache](https://huggingface.co/papers/2403.06923) accelerates diffusion inference by using Taylor series expansions to approximate and cache intermediate activations across denoising steps. The method predicts future outputs based on past computations, reusing them at specified intervals to reduce redundant calculations.
+
+This caching mechanism delivers strong results with minimal additional memory overhead. For detailed performance analysis, see [our findings here](https://github.com/huggingface/diffusers/pull/12648#issuecomment-3610615080).
+
+To enable TaylorSeer Cache, create a [`TaylorSeerCacheConfig`] and pass it to your pipeline's transformer:
+
+- `cache_interval`: Number of steps to reuse cached outputs before performing a full forward pass
+- `disable_cache_before_step`: Initial steps that use full computations to gather data for the approximations
+- `max_order`: Approximation accuracy; in theory, higher values improve quality but increase memory usage, so we recommend setting it to `1`
+
+```python
+import torch
+from diffusers import FluxPipeline, TaylorSeerCacheConfig
+
+pipe = FluxPipeline.from_pretrained(
+    "black-forest-labs/FLUX.1-dev",
+    torch_dtype=torch.bfloat16,
+)
+pipe.to("cuda")
+
+config = TaylorSeerCacheConfig(
+    cache_interval=5,
+    max_order=1,
+    disable_cache_before_step=10,
+    taylor_factors_dtype=torch.bfloat16,
+)
+pipe.transformer.enable_cache(config)
 ```
````
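To make the interplay of the two scheduling knobs concrete, here is an illustrative standalone sketch (not diffusers API) of which denoising steps would run a full forward pass under the settings above, assuming caching stays off for the first `disable_cache_before_step` steps and a full refresh then happens once every `cache_interval` steps.

```python
cache_interval = 5
disable_cache_before_step = 10

for step in range(20):
    if step < disable_cache_before_step:
        kind = "full forward (warm-up, gathers Taylor factors)"
    elif (step - disable_cache_before_step) % cache_interval == 0:
        kind = "full forward (refreshes the cache)"
    else:
        kind = "cached (Taylor-series prediction)"
    print(f"step {step:2d}: {kind}")
```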

docs/source/en/quantization/modelopt.md

Lines changed: 3 additions & 3 deletions

```diff
@@ -11,7 +11,7 @@ specific language governing permissions and limitations under the License. -->

 # NVIDIA ModelOpt

-[NVIDIA-ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is a unified library of state-of-the-art model optimization techniques like quantization, pruning, distillation, speculative decoding, etc. It compresses deep learning models for downstream deployment frameworks like TensorRT-LLM or TensorRT to optimize inference speed.
+[NVIDIA-ModelOpt](https://github.com/NVIDIA/Model-Optimizer) is a unified library of state-of-the-art model optimization techniques like quantization, pruning, distillation, speculative decoding, etc. It compresses deep learning models for downstream deployment frameworks like TensorRT-LLM or TensorRT to optimize inference speed.

 Before you begin, make sure you have nvidia_modelopt installed.

@@ -57,7 +57,7 @@ image.save("output.png")
 >
 > The quantization methods in NVIDIA-ModelOpt are designed to reduce the memory footprint of model weights using various QAT (Quantization-Aware Training) and PTQ (Post-Training Quantization) techniques while maintaining model performance. However, the actual performance gain during inference depends on the deployment framework (e.g., TRT-LLM, TensorRT) and the specific hardware configuration.
 >
-> More details can be found [here](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples).
+> More details can be found [here](https://github.com/NVIDIA/Model-Optimizer/tree/main/examples).

 ## NVIDIAModelOptConfig

@@ -86,7 +86,7 @@ The quantization methods supported are as follows:
 | **NVFP4** | `nvfp4 weight only`, `nvfp4 block quantization` | `quant_type`, `quant_type + channel_quantize + block_quantize` | `channel_quantize = -1 is only supported for now`|


-Refer to the [official modelopt documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/) for a better understanding of the available quantization methods and the exhaustive list of configuration options available.
+Refer to the [official modelopt documentation](https://nvidia.github.io/Model-Optimizer/) for a better understanding of the available quantization methods and the exhaustive list of configuration options available.

 ## Serializing and Deserializing quantized models
```

scripts/convert_hunyuan_video1_5_to_diffusers.py

Lines changed: 27 additions & 2 deletions

```diff
@@ -69,6 +69,11 @@
         "target_size": 960,
         "task_type": "i2v",
     },
+    "480p_i2v_step_distilled": {
+        "target_size": 640,
+        "task_type": "i2v",
+        "use_meanflow": True,
+    },
 }

 SCHEDULER_CONFIGS = {
@@ -93,6 +98,9 @@
     "720p_i2v_distilled": {
         "shift": 7.0,
     },
+    "480p_i2v_step_distilled": {
+        "shift": 7.0,
+    },
 }

 GUIDANCE_CONFIGS = {
@@ -117,6 +125,9 @@
     "720p_i2v_distilled": {
         "guidance_scale": 1.0,
     },
+    "480p_i2v_step_distilled": {
+        "guidance_scale": 1.0,
+    },
 }


@@ -126,7 +137,7 @@ def swap_scale_shift(weight):
     return new_weight


-def convert_hyvideo15_transformer_to_diffusers(original_state_dict):
+def convert_hyvideo15_transformer_to_diffusers(original_state_dict, config=None):
     """
     Convert HunyuanVideo 1.5 original checkpoint to Diffusers format.
     """
@@ -142,6 +153,20 @@ def convert_hyvideo15_transformer_to_diffusers(original_state_dict):
     )
     converted_state_dict["time_embed.timestep_embedder.linear_2.bias"] = original_state_dict.pop("time_in.mlp.2.bias")

+    if config.use_meanflow:
+        converted_state_dict["time_embed.timestep_embedder_r.linear_1.weight"] = original_state_dict.pop(
+            "time_r_in.mlp.0.weight"
+        )
+        converted_state_dict["time_embed.timestep_embedder_r.linear_1.bias"] = original_state_dict.pop(
+            "time_r_in.mlp.0.bias"
+        )
+        converted_state_dict["time_embed.timestep_embedder_r.linear_2.weight"] = original_state_dict.pop(
+            "time_r_in.mlp.2.weight"
+        )
+        converted_state_dict["time_embed.timestep_embedder_r.linear_2.bias"] = original_state_dict.pop(
+            "time_r_in.mlp.2.bias"
+        )
+
     # 2. context_embedder.time_text_embed.timestep_embedder <- txt_in.t_embedder
     converted_state_dict["context_embedder.time_text_embed.timestep_embedder.linear_1.weight"] = (
         original_state_dict.pop("txt_in.t_embedder.mlp.0.weight")
@@ -627,7 +652,7 @@ def convert_transformer(args):
     config = TRANSFORMER_CONFIGS[args.transformer_type]
     with init_empty_weights():
         transformer = HunyuanVideo15Transformer3DModel(**config)
-    state_dict = convert_hyvideo15_transformer_to_diffusers(original_state_dict)
+    state_dict = convert_hyvideo15_transformer_to_diffusers(original_state_dict, config=transformer.config)
     transformer.load_state_dict(state_dict, strict=True, assign=True)

     return transformer
```
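The four meanflow assignments added in `convert_hyvideo15_transformer_to_diffusers` follow a single rename pattern (`time_r_in.mlp.{0,2}` → `time_embed.timestep_embedder_r.linear_{1,2}`); an equivalent compact form, shown only for illustration on a dummy state dict, could be:

```python
# Illustrative compression of the meanflow remapping in the diff above:
# mlp.0 -> linear_1 and mlp.2 -> linear_2, for both weight and bias.
original_state_dict = {
    f"time_r_in.mlp.{i}.{k}": None for i in (0, 2) for k in ("weight", "bias")
}
converted_state_dict = {}
for src_idx, dst_idx in ((0, 1), (2, 2)):
    for kind in ("weight", "bias"):
        converted_state_dict[
            f"time_embed.timestep_embedder_r.linear_{dst_idx}.{kind}"
        ] = original_state_dict.pop(f"time_r_in.mlp.{src_idx}.{kind}")
print(sorted(converted_state_dict))
```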

src/diffusers/__init__.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -169,10 +169,12 @@
         "LayerSkipConfig",
         "PyramidAttentionBroadcastConfig",
         "SmoothedEnergyGuidanceConfig",
+        "TaylorSeerCacheConfig",
         "apply_faster_cache",
         "apply_first_block_cache",
         "apply_layer_skip",
         "apply_pyramid_attention_broadcast",
+        "apply_taylorseer_cache",
     ]
 )
 _import_structure["models"].extend(
@@ -900,10 +902,12 @@
         LayerSkipConfig,
         PyramidAttentionBroadcastConfig,
         SmoothedEnergyGuidanceConfig,
+        TaylorSeerCacheConfig,
         apply_faster_cache,
         apply_first_block_cache,
         apply_layer_skip,
         apply_pyramid_attention_broadcast,
+        apply_taylorseer_cache,
     )
     from .models import (
         AllegroTransformer3DModel,
```
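The practical effect of these four lines: the lazy `_import_structure` entry and the eager `TYPE_CHECKING` import mirror each other, so after this commit both new names are importable from the package root.

```python
from diffusers import TaylorSeerCacheConfig, apply_taylorseer_cache
```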
