gradio/gradio_infer_demo.py (9 changes: 4 additions & 5 deletions)

@@ -119,7 +119,7 @@ def update_task(hf_model_id: str) -> Tuple[gr.Dropdown, gr.Component]:
 def update_subcheckpoints(checkpoint_dir):
     """Get subdirectories for the selected checkpoint directory."""
     if checkpoint_dir == "None":
-        return gr.Dropdown(choices=[], interactive=False, visible=False)
+        return gr.Dropdown(choices=["None"], value="None", interactive=False, visible=False)
 
     # Get the full path to the checkpoint directory
     full_checkpoint_path = os.path.join(checkpoint_rootdir, checkpoint_dir)
@@ -138,7 +138,7 @@ def update_subcheckpoints(checkpoint_dir):
 
     if not subdirs:
         # If there are no subdirectories, hide the dropdown
-        return gr.Dropdown(choices=[], interactive=False, visible=False)
+        return gr.Dropdown(choices=["None"], value="None", interactive=False, visible=False)
 
     # Show dropdown with available subdirectories
     return gr.Dropdown(
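
Both update_subcheckpoints fallbacks above now return a hidden, non-interactive dropdown seeded with the "None" sentinel instead of an empty choices list, presumably so the component always carries a value that the rest of the demo (which compares against the string "None") can handle. A minimal sketch of the pattern; the helper name is hypothetical and only standard gr.Dropdown keyword arguments are used, this is not code from the PR:

import gradio as gr

def checkpoint_dropdown(subdirs: list[str]) -> gr.Dropdown:
    # Hypothetical helper mirroring the fallback above: with nothing to select,
    # return a hidden dropdown whose value is the "None" sentinel the demo
    # checks for, rather than an empty choice list with no current value.
    if not subdirs:
        return gr.Dropdown(choices=["None"], value="None", interactive=False, visible=False)
    return gr.Dropdown(choices=["None"] + subdirs, value="None", interactive=True, visible=True)
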
@@ -183,6 +183,7 @@ def load_model_and_generate(
     )
 
     # Load LoRA weights if selected
+    unload_lora_checkpoint(pipeline)
     if lora_checkpoint != "None":
         progress(0.3, desc="Loading LoRA weights...")
         # Construct the full path to the specific checkpoint
@@ -192,8 +193,6 @@ def load_model_and_generate(
             lora_path = lora_checkpoint
         logger.info(f"Loading LoRA weights from {lora_path}")
         load_lora_checkpoint(pipeline, lora_path)
-    else:
-        unload_lora_checkpoint(pipeline)
 
     # Generate content based on task
     progress(0.5, desc="Generating content...")
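
The unload call now runs unconditionally before the optional load, instead of only in the removed else branch, presumably so that an adapter loaded for a previous generation is always cleared before a new (or no) LoRA is applied. A sketch of the resulting control flow; the wrapper function is hypothetical and the import path of the two helpers is an assumption (the demo already has them in scope):

from cogkit.utils import load_lora_checkpoint, unload_lora_checkpoint  # assumed import path

def apply_lora_selection(pipeline, lora_checkpoint: str, lora_path: str) -> None:
    # Always drop whatever adapter a previous run may have left on the pipeline,
    # then load the newly selected checkpoint only if one was chosen.
    unload_lora_checkpoint(pipeline)
    if lora_checkpoint != "None":
        load_lora_checkpoint(pipeline, lora_path)
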
@@ -300,7 +299,7 @@ def load_model_and_generate(
            guidance_scale = gr.Slider(
                minimum=1.0,
                maximum=15.0,
-                value=6.0,
+                value=5.0,
                step=0.1,
                label="Guidance Scale",
                info="Higher values increase prompt adherence",

gradio/gradio_lora_demo.py (2 changes: 1 addition & 1 deletion)

@@ -11,7 +11,6 @@
 from torchvision.io import write_video
 from utils import (
     BaseTask,
-    flatten_dict,
     get_dataset_dirs,
     get_logger,
     get_lora_checkpoint_rootdir,
@@ -24,6 +23,7 @@
 
 import gradio as gr
 from cogkit import GenerationMode, guess_generation_mode
+from cogkit.utils import flatten_dict
 
 # ======================= global state ====================
 

gradio/utils/__init__.py (3 changes: 1 addition & 2 deletions)

@@ -8,7 +8,7 @@
     resolve_path,
 )
 from .logging import get_logger
-from .misc import flatten_dict, get_resolutions
+from .misc import get_resolutions
 from .task import BaseTask
 
 __all__ = [
@@ -22,5 +22,4 @@
     "resolve_path",
     "BaseTask",
     "get_resolutions",
-    "flatten_dict",
 ]

gradio/utils/misc.py (36 changes: 1 addition & 35 deletions)

@@ -1,9 +1,7 @@
-from typing import Any, Dict, List
-
 from cogkit import GenerationMode
 
 
-def get_resolutions(task: GenerationMode) -> List[str]:
+def get_resolutions(task: GenerationMode) -> list[str]:
     if task == GenerationMode.TextToImage:
         return [
             "512x512",
@@ -19,35 +17,3 @@ def get_resolutions(task: GenerationMode) -> List[str]:
             "49x480x720",
             "81x768x1360",
         ]
-
-
-def flatten_dict(d: Dict[str, Any], ignore_none: bool = False) -> Dict[str, Any]:
-    """
-    Flattens a nested dictionary into a single layer dictionary.
-
-    Args:
-        d: The dictionary to flatten
-        ignore_none: If True, keys with None values will be omitted
-
-    Returns:
-        A flattened dictionary
-
-    Raises:
-        ValueError: If there are duplicate keys across nested dictionaries
-    """
-    result = {}
-
-    def _flatten(current_dict, result_dict):
-        for key, value in current_dict.items():
-            if value is None and ignore_none:
-                continue
-
-            if isinstance(value, dict):
-                _flatten(value, result_dict)
-            else:
-                if key in result_dict:
-                    raise ValueError(f"Duplicate key '{key}' found in nested dictionary")
-                result_dict[key] = value
-
-    _flatten(d, result)
-    return result
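
flatten_dict is not dropped outright: as the gradio_lora_demo.py change above shows, it is now imported from cogkit.utils rather than from the local gradio/utils package. Assuming the relocated function keeps the behaviour of the removed local copy shown here, usage looks like this:

from cogkit.utils import flatten_dict  # import path taken from the gradio_lora_demo.py diff

nested = {"model": {"lr": 5e-05, "dropout": None}, "seed": 42}

# Nested dicts are collapsed into a single level; with ignore_none=True, keys with
# None values are skipped; duplicate keys across levels raise ValueError.
print(flatten_dict(nested, ignore_none=True))  # {'lr': 5e-05, 'seed': 42}
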

pyproject.toml (2 changes: 2 additions & 0 deletions)

@@ -33,6 +33,8 @@ finetune = [
     "datasets~=3.4",
     "deepspeed~=0.16.4",
     "av~=14.2.0",
+    "bitsandbytes~=0.45.4",
+    "tensorboard~=2.19",
 ]
 
 [project.urls]

quickstart/scripts/train.py (2 changes: 1 addition & 1 deletion)

@@ -7,7 +7,7 @@ def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("--model_name", type=str, required=True)
     parser.add_argument("--training_type", type=str, required=True)
-    parser.add_argument("--enable_packing", action="store_true")
+    parser.add_argument("--enable_packing", type=lambda x: x.lower() == "true")
     args, unknown = parser.parse_known_args()
 
     trainer_cls = get_model_cls(args.model_name, args.training_type, args.enable_packing)
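
The quickstart scripts below pass explicit values such as --enable_packing false and --pin_memory true. With action="store_true" the flag takes no value, so --enable_packing false would still set the attribute to True and leave "false" in the extras returned by parse_known_args; the lambda instead converts the string, with anything other than "true" (case-insensitively) becoming False, and the default being None when the flag is omitted. An equivalent, slightly stricter stand-alone sketch (not the repository's code):

import argparse

def str2bool(value: str) -> bool:
    # Accept the lowercase booleans used by the quickstart shell scripts.
    if value.lower() in ("true", "1", "yes"):
        return True
    if value.lower() in ("false", "0", "no"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean, got {value!r}")

parser = argparse.ArgumentParser()
parser.add_argument("--enable_packing", type=str2bool, default=False)
print(parser.parse_args(["--enable_packing", "false"]).enable_packing)  # False
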

quickstart/scripts/train_ddp_i2v.sh (14 changes: 7 additions & 7 deletions)

@@ -21,11 +21,6 @@ OUTPUT_ARGS=(
 # Data Configuration
 DATA_ARGS=(
     --data_root "/path/to/data"
-
-    # Note:
-    # for CogVideoX series models, number of training frames should be **8N+1**
-    # for CogVideoX1.5 series models, number of training frames should be **16N+1**
-    --train_resolution "81x768x1360" # (frames x height x width)
 )
 
 # Training Configuration
@@ -35,13 +30,18 @@ TRAIN_ARGS=(
     --batch_size 1
     --gradient_accumulation_steps 1
     --mixed_precision "bf16" # ["no", "fp16"]
-    --learning_rate 2e-5
+    --learning_rate 5e-5
+
+    # Note:
+    # for CogVideoX series models, number of training frames should be **8N+1**
+    # for CogVideoX1.5 series models, number of training frames should be **16N+1**
+    --train_resolution "81x768x1360" # (frames x height x width)
 )
 
 # System Configuration
 SYSTEM_ARGS=(
     --num_workers 8
-    --pin_memory True
+    --pin_memory true
     --nccl_timeout 1800
 )
 
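
The frame-count note moves from DATA_ARGS into TRAIN_ARGS, next to the --train_resolution argument it describes: CogVideoX models expect 8N+1 training frames, CogVideoX1.5 models expect 16N+1, and the string is laid out as frames x height x width. A small stand-alone check of that rule (hypothetical helper, not part of the repository):

def check_video_resolution(res: str, frame_multiple: int = 8) -> None:
    # Validate a "frames x height x width" string such as "81x768x1360".
    frames, height, width = (int(part) for part in res.split("x"))
    if (frames - 1) % frame_multiple != 0:
        raise ValueError(f"{frames} frames is not of the form {frame_multiple}N+1")
    print(f"OK: {frames} frames at {height}x{width}")

check_video_resolution("81x768x1360", frame_multiple=8)   # CogVideoX: 8N+1 (81 = 8*10 + 1)
check_video_resolution("81x768x1360", frame_multiple=16)  # CogVideoX1.5: 16N+1 (81 = 16*5 + 1)
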

quickstart/scripts/train_ddp_t2i.sh (24 changes: 18 additions & 6 deletions)

@@ -21,26 +21,38 @@ OUTPUT_ARGS=(
 # Data Configuration
 DATA_ARGS=(
     --data_root "/path/to/data"
-
-    # Note:
-    # For CogView4 series models, height and width should be **32N** (multiple of 32)
-    --train_resolution "1024x1024" # (height x width)
 )
 
 # Training Configuration
 TRAIN_ARGS=(
     --seed 42 # random seed
     --train_epochs 1 # number of training epochs
     --batch_size 1
+
     --gradient_accumulation_steps 1
+
+    # Note: For CogView4 series models, height and width should be **32N** (multiple of 32)
+    --train_resolution "1024x1024" # (height x width)
+
+    # When enable_packing is true, training will use the native image resolution
+    # (otherwise all images will be resized to train_resolution, which may distort the original aspect ratio).
+    #
+    # IMPORTANT: When changing enable_packing from true to false (or vice versa),
+    # make sure to clear the .cache directories in your data_root/train and data_root/test folders if they exist.
+    --enable_packing false
+
     --mixed_precision "bf16" # ["no", "fp16"]
-    --learning_rate 2e-5
+    --learning_rate 5e-5
+
+    # enable --low_vram will slow down validation speed and enable quantization during training
+    # Note: --low_vram currently does not support multi-GPU training
+    --low_vram false
 )
 
 # System Configuration
 SYSTEM_ARGS=(
     --num_workers 8
-    --pin_memory True
+    --pin_memory true
     --nccl_timeout 1800
 )
 
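
Two of the notes added above are worth keeping in mind when editing this script: CogView4 training resolutions must be multiples of 32 in both dimensions, and toggling --enable_packing requires clearing the .cache directories under data_root/train and data_root/test. A hypothetical helper (not part of the repository) covering both:

import shutil
from pathlib import Path

def check_image_resolution(res: str) -> None:
    # Enforce the 32N rule for a "height x width" string such as "1024x1024".
    height, width = (int(part) for part in res.split("x"))
    if height % 32 or width % 32:
        raise ValueError(f"{res}: height and width must be multiples of 32")

def clear_packing_cache(data_root: str) -> None:
    # Remove the cache directories mentioned in the enable_packing note above.
    for split in ("train", "test"):
        cache_dir = Path(data_root) / split / ".cache"
        if cache_dir.is_dir():
            shutil.rmtree(cache_dir)

check_image_resolution("1024x1024")
clear_packing_cache("/path/to/data")
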

quickstart/scripts/train_ddp_t2v.sh (14 changes: 7 additions & 7 deletions)

@@ -20,11 +20,6 @@ OUTPUT_ARGS=(
 # Data Configuration
 DATA_ARGS=(
     --data_root "/path/to/data"
-
-    # Note:
-    # for CogVideoX series models, number of training frames should be **8N+1**
-    # for CogVideoX1.5 series models, number of training frames should be **16N+1**
-    --train_resolution "81x768x1360" # (frames x height x width)
 )
 
 # Training Configuration
@@ -34,13 +29,18 @@ TRAIN_ARGS=(
     --batch_size 1
     --gradient_accumulation_steps 1
     --mixed_precision "bf16" # ["no", "fp16"] Note: CogVideoX-2B only supports fp16 training
-    --learning_rate 2e-5
+    --learning_rate 5e-5
+
+    # Note:
+    # for CogVideoX series models, number of training frames should be **8N+1**
+    # for CogVideoX1.5 series models, number of training frames should be **16N+1**
+    --train_resolution "81x768x1360" # (frames x height x width)
 )
 
 # System Configuration
 SYSTEM_ARGS=(
     --num_workers 8
-    --pin_memory True
+    --pin_memory true
     --nccl_timeout 1800
 )
 

quickstart/scripts/train_zero_i2v.sh (15 changes: 8 additions & 7 deletions)

@@ -20,31 +20,32 @@ OUTPUT_ARGS=(
 # Data Configuration
 DATA_ARGS=(
     --data_root "/path/to/data"
-
-    # Note:
-    # for CogVideoX series models, number of training frames should be **8N+1**
-    # for CogVideoX1.5 series models, number of training frames should be **16N+1**
-    --train_resolution "81x768x1360" # (frames x height x width)
 )
 
 # Training Configuration
 TRAIN_ARGS=(
     --seed 42 # random seed
     --train_epochs 1 # number of training epochs
 
-    --learning_rate 2e-5
+    --learning_rate 5e-5
 
     ######### Please keep consistent with deepspeed config file ##########
     --batch_size 1
     --gradient_accumulation_steps 1
     --mixed_precision "bf16" # ["no", "fp16"] Note: CogVideoX-2B only supports fp16 training
     ########################################################################
+
+    # Note:
+    # for CogVideoX series models, number of training frames should be **8N+1**
+    # for CogVideoX1.5 series models, number of training frames should be **16N+1**
+    --train_resolution "81x768x1360" # (frames x height x width)
+
 )
 
 # System Configuration
 SYSTEM_ARGS=(
     --num_workers 8
-    --pin_memory True
+    --pin_memory true
     --nccl_timeout 1800
 )
 
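
The ZeRO scripts keep the reminder that --batch_size, --gradient_accumulation_steps and --mixed_precision must stay consistent with the DeepSpeed configuration passed to accelerate launch (../configs/accelerate_config.yaml in the launch command visible in the train_zero_t2i.sh diff below). A sketch of such a cross-check; the YAML key names are assumptions about a typical Accelerate + DeepSpeed config file, not taken from this repository:

import yaml

script_args = {"gradient_accumulation_steps": 1, "mixed_precision": "bf16"}

with open("../configs/accelerate_config.yaml") as f:
    cfg = yaml.safe_load(f)

ds_cfg = cfg.get("deepspeed_config", {})
assert cfg.get("mixed_precision") == script_args["mixed_precision"]
assert ds_cfg.get("gradient_accumulation_steps") == script_args["gradient_accumulation_steps"]
print("script arguments match the accelerate config")
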

quickstart/scripts/train_zero_t2i.sh (20 changes: 13 additions & 7 deletions)

@@ -20,31 +20,37 @@ OUTPUT_ARGS=(
 # Data Configuration
 DATA_ARGS=(
     --data_root "/path/to/data"
-
-    # Note:
-    # For CogView4 series models, height and width should be **32N** (multiple of 32)
-    --train_resolution "1024x1024" # (height x width)
 )
 
 # Training Configuration
 TRAIN_ARGS=(
     --seed 42 # random seed
     --train_epochs 1 # number of training epochs
 
-    --learning_rate 2e-5
+    --learning_rate 5e-5
 
+    # Note: For CogView4 series models, height and width should be **32N** (multiple of 32)
+    --train_resolution "1024x1024" # (height x width)
+
     ######### Please keep consistent with deepspeed config file ##########
     --batch_size 1
     --gradient_accumulation_steps 1
     --mixed_precision "bf16" # ["no", "fp16"] Note: CogVideoX-2B only supports fp16 training
     ########################################################################
+
+    # When enable_packing is true, training will use the native image resolution
+    # (otherwise all images will be resized to train_resolution, which may distort the original aspect ratio).
+    #
+    # IMPORTANT: When changing enable_packing from true to false (or vice versa),
+    # make sure to clear the .cache directories in your data_root/train and data_root/test folders if they exist.
+    --enable_packing false
 
 )
 
 # System Configuration
 SYSTEM_ARGS=(
     --num_workers 8
-    --pin_memory True
+    --pin_memory true
     --nccl_timeout 1800
 )
 
@@ -62,7 +68,7 @@ VALIDATION_ARGS=(
 )
 
 # Combine all arguments and launch training
-accelerate launch --config_file ../configs/accelerate_config.yaml train.py \
+accelerate launch --config_file ../configs/accelerate_config.yaml train.py\
     "${MODEL_ARGS[@]}" \
     "${OUTPUT_ARGS[@]}" \
     "${DATA_ARGS[@]}" \

quickstart/scripts/train_zero_t2v.sh (14 changes: 7 additions & 7 deletions)

@@ -20,31 +20,31 @@ OUTPUT_ARGS=(
 # Data Configuration
 DATA_ARGS=(
     --data_root "/path/to/data"
-
-    # Note:
-    # for CogVideoX series models, number of training frames should be **8N+1**
-    # for CogVideoX1.5 series models, number of training frames should be **16N+1**
-    --train_resolution "81x768x1360" # (frames x height x width)
 )
 
 # Training Configuration
 TRAIN_ARGS=(
     --seed 42 # random seed
     --train_epochs 1 # number of training epochs
 
-    --learning_rate 2e-5
+    --learning_rate 5e-5
 
     ######### Please keep consistent with deepspeed config file ##########
     --batch_size 1
     --gradient_accumulation_steps 1
     --mixed_precision "bf16" # ["no", "fp16"] Note: CogVideoX-2B only supports fp16 training
     ########################################################################
+
+    # Note:
+    # for CogVideoX series models, number of training frames should be **8N+1**
+    # for CogVideoX1.5 series models, number of training frames should be **16N+1**
+    --train_resolution "81x768x1360" # (frames x height x width)
 )
 
 # System Configuration
 SYSTEM_ARGS=(
     --num_workers 8
-    --pin_memory True
+    --pin_memory true
     --nccl_timeout 1800
 )
 