29 changes: 29 additions & 0 deletions dfm/examples/Automodel/finetune/finetune.py
@@ -0,0 +1,29 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import annotations

from Automodel.recipes.finetune import TrainWan21DiffusionRecipe
from nemo_automodel.components.config._arg_parser import parse_args_and_load_config


def main(default_config_path="/opt/DFM/dfm/examples/Automodel/finetune/wan2_1_t2v_flow.yaml"):
    cfg = parse_args_and_load_config(default_config_path)
    recipe = TrainWan21DiffusionRecipe(cfg)
    recipe.setup()
    recipe.run_train_validation_loop()


if __name__ == "__main__":
    main()
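Usage note (a sketch, not part of the diff): finetune.py resolves its config through parse_args_and_load_config, so the same entry point drives both YAML configs below. The launch command and import path here are assumptions, shown for orientation only:

# Typical launch, assuming one process per GPU under torchrun (assumption):
#   torchrun --nproc-per-node=8 dfm/examples/Automodel/finetune/finetune.py
# Pointing main() at the multi-node YAML is just a path change; the import
# path below is hypothetical and depends on how the package is installed.
from finetune import main

main("/opt/DFM/dfm/examples/Automodel/finetune/wan2_1_t2v_flow_multinode.yaml")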
59 changes: 59 additions & 0 deletions dfm/examples/Automodel/finetune/wan2_1_t2v_flow.yaml
@@ -0,0 +1,59 @@
seed: 42

wandb:
  project: wan-t2v-flow-matching
  mode: online
  name: wan2_1_t2v_fm_updated

dist_env:
  backend: nccl
  timeout_minutes: 30

model:
  pretrained_model_name_or_path: Wan-AI/Wan2.1-T2V-1.3B-Diffusers

data:
  dataloader:
    _target_: Automodel.datasets.build_wan21_dataloader
    meta_folder: /lustre/fsw/portfolios/coreai/users/linnanw/hdvilla_sample/pika/wan21_codes/1.3B_meta/
    batch_size: 1
    num_workers: 2
    device: cpu

batch:
  batch_size_per_node: 8

training:
  num_epochs: 100

optim:
  learning_rate: 5e-6
  optimizer:
    weight_decay: 0.01
    betas: [0.9, 0.999]

flow_matching:
  use_sigma_noise: true
  timestep_sampling: uniform
  logit_mean: 0.0
  logit_std: 1.0
  flow_shift: 3.0
  mix_uniform_ratio: 0.1

fsdp:
  tp_size: 1
  cp_size: 1
  pp_size: 1
  dp_replicate_size: 1
  dp_size: 8

logging:
  save_every: 1000
  log_every: 2

checkpoint:
  enabled: true
  checkpoint_dir: /opt/DFM/wan_t2v_flow_outputs_base_recipe_fsdp_run_1/
  model_save_format: torch_save
  save_consolidated: false
  restore_from: null
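For orientation, a quick sanity check on the fsdp block, assuming the usual convention that the parallel sizes multiply to the launched world size (the same arithmetic wan_generate.py below uses for dp_rank; the convention itself is an assumption here):

# Sketch: wan2_1_t2v_flow.yaml describes a single 8-GPU node.
tp, cp, pp, dp = 1, 1, 1, 8         # values from the fsdp block above
world_size = tp * cp * pp * dp      # 8 ranks in total
per_rank_batch = 1                  # data.dataloader.batch_size
assert per_rank_batch * dp == 8     # consistent with batch.batch_size_per_node: 8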
59 changes: 59 additions & 0 deletions dfm/examples/Automodel/finetune/wan2_1_t2v_flow_multinode.yaml
@@ -0,0 +1,59 @@
seed: 42

wandb:
  project: wan-t2v-flow-matching
  mode: online
  name: wan2_1_t2v_fm_updated

dist_env:
  backend: nccl
  timeout_minutes: 30

model:
  pretrained_model_name_or_path: Wan-AI/Wan2.1-T2V-1.3B-Diffusers

data:
  dataloader:
    _target_: Automodel.datasets.build_wan21_dataloader
    meta_folder: /lustre/fsw/portfolios/coreai/users/linnanw/hdvilla_sample/pika/wan21_codes/1.3B_meta/
    batch_size: 1
    num_workers: 2
    device: cpu

batch:
  batch_size_per_node: 8

training:
  num_epochs: 100

optim:
  learning_rate: 5e-6
  optimizer:
    weight_decay: 0.01
    betas: [0.9, 0.999]

flow_matching:
  use_sigma_noise: true
  timestep_sampling: uniform
  logit_mean: 0.0
  logit_std: 1.0
  flow_shift: 3.0
  mix_uniform_ratio: 0.1

fsdp:
  tp_size: 1
  cp_size: 1
  pp_size: 1
  dp_replicate_size: 2
  dp_size: 16

logging:
  save_every: 1000
  log_every: 2

checkpoint:
  enabled: true
  checkpoint_dir: /opt/DFM/wan_t2v_flow_outputs_base_recipe_multi_node_fsdp_run_3/
  model_save_format: torch_save
  save_consolidated: false
  restore_from: null
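The multi-node variant differs from the single-node YAML only in the data-parallel layout and the checkpoint directory. Reading dp_replicate_size as an HSDP-style replication factor (an assumption: replicate across nodes, shard within each node), the sizes work out to two 8-GPU nodes:

# Sketch: wan2_1_t2v_flow_multinode.yaml under the same world-size convention.
tp, cp, pp, dp = 1, 1, 1, 16
world_size = tp * cp * pp * dp      # 16 ranks
dp_replicate = 2                    # dp_replicate_size: one replica per node (assumed)
dp_shard = dp // dp_replicate       # 8-way sharding within each replica
assert world_size == 2 * 8          # two nodes of 8 GPUs each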
147 changes: 147 additions & 0 deletions dfm/examples/Automodel/generate/wan_generate.py
@@ -0,0 +1,147 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import logging
import os

import torch
import torch.distributed as dist
from Automodel._diffusers.auto_diffusion_pipeline import NeMoAutoDiffusionPipeline
from diffusers import AutoencoderKLWan
from diffusers.utils import export_to_video
from nemo_automodel.components.distributed.init_utils import initialize_distributed
from nemo_automodel.components.loggers.log_utils import setup_logging


def parse_args():
    parser = argparse.ArgumentParser(description="Wan2.2 T2V FSDP2 generation")
    default_prompt = (
        "The video begins with a close-up of a white bowl filled with shredded coleslaw, "
        "which has a mix of purple cabbage and white cabbage, and is garnished with a sprinkle "
        "of seasoning. The coleslaw is placed on a wooden cutting board. As the video progresses, "
        "the camera pans to the right, revealing a burger with a sesame seed bun, a beef patty, "
        "melted yellow cheese, slices of red tomato, and crispy bacon."
    )

    parser.add_argument("--prompt", type=str, default=default_prompt, help="Text prompt for generation")
    parser.add_argument("--height", type=int, default=480, help="Output video height")
    parser.add_argument("--width", type=int, default=848, help="Output video width")
    parser.add_argument("--num-frames", type=int, default=111, help="Number of frames to generate")
    parser.add_argument("--guidance-scale", type=float, default=4.0, help="CFG scale for main guidance")
    parser.add_argument("--guidance-scale-2", type=float, default=3.0, help="CFG scale for secondary guidance")
    parser.add_argument("--num-inference-steps", type=int, default=20, help="Number of diffusion steps")
    parser.add_argument("--fps", type=int, default=24, help="Frames per second for output video")
    parser.add_argument("--output", type=str, default="t2v_fsdp2_rank0.mp4", help="Output video filename")
    parser.add_argument("--seed", type=int, default=42, help="Base random seed (dp rank added)")
    # Parallelism sizes
    parser.add_argument(
        "--tp-size",
        type=int,
        default=8,
        help="Tensor-parallel group size",
    )
    parser.add_argument(
        "--cp-size",
        type=int,
        default=1,
        help="Context-parallel group size",
    )
    parser.add_argument(
        "--pp-size",
        type=int,
        default=1,
        help="Pipeline-parallel group size",
    )
    parser.add_argument(
        "--dp-size",
        type=int,
        default=1,
        help="Data-parallel group size",
    )
    return parser.parse_args()


def main():
    args = parse_args()
    initialize_distributed(backend="nccl", timeout_minutes=10)
    setup_logging()
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    torch.cuda.set_device(local_rank)
    world_size = dist.get_world_size()
    device = torch.device("cuda", local_rank)
    bf16 = torch.bfloat16

    # Configuration for TP+CP+PP+DP
    tp_size = args.tp_size
    cp_size = args.cp_size
    pp_size = args.pp_size
    dp_size = args.dp_size
    # Derive the data-parallel rank from the global rank (not LOCAL_RANK, which
    # resets per node), so replicas on different nodes get distinct seeds below.
    dp_rank = dist.get_rank() // (tp_size * cp_size * pp_size)

    # -------- Load pipeline --------
    logging.info("[Loading] Loading VAE and pipeline...")
    vae = AutoencoderKLWan.from_pretrained(
        "Wan-AI/Wan2.2-T2V-A14B-Diffusers", subfolder="vae", torch_dtype=torch.bfloat16
    )
    # Build per-component managers mapping
    manager_args = {
        "dp_size": dp_size,
        "tp_size": tp_size,
        "cp_size": cp_size,
        "pp_size": pp_size,
        "backend": "nccl",
        "world_size": world_size,
        "use_hf_tp_plan": False,
    }
    # Wan pipelines typically have components like: 'vae', 'text_encoder', 'image_encoder', 'transformer', 'transformer_2'
    # Parallelize only the heavy transformer components
    parallel_scheme = {}
    for name in ("transformer", "transformer_2"):
        parallel_scheme[name] = manager_args

    pipe, _ = NeMoAutoDiffusionPipeline.from_pretrained(
        "Wan-AI/Wan2.2-T2V-A14B-Diffusers", vae=vae, torch_dtype=torch.bfloat16, parallel_scheme=parallel_scheme
    )
    logging.info("[Setup] Pipeline loaded and parallelized via NeMoAutoDiffusionPipeline")
    dist.barrier()

    # -------- Inference --------
    logging.info("[Inference] Starting distributed inference...")
    torch.manual_seed(args.seed + dp_rank)

    with torch.no_grad(), torch.autocast(device_type="cuda", dtype=bf16):
        out = pipe(
            prompt=args.prompt,
            height=args.height,
            width=args.width,
            num_frames=args.num_frames,
            guidance_scale=args.guidance_scale,
            guidance_scale_2=args.guidance_scale_2,
            num_inference_steps=args.num_inference_steps,
        ).frames[0]

    if dist.get_rank() == 0:
        export_to_video(out, args.output, fps=args.fps)
        logging.info(f"[Inference] Saved {args.output}")

    dist.barrier()
    logging.info(
        f"[Complete] Automodel FSDP2 inference completed! TP={tp_size}, CP={cp_size}, PP={pp_size}, DP={dp_size}"
    )
    dist.destroy_process_group()


if __name__ == "__main__":
    main()
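For reference, a sketch of the rank-to-group mapping implied by the script's dp_rank arithmetic: each block of tp * cp * pp consecutive global ranks forms one model-parallel group, and each group is one data-parallel replica (dp = 2 below is for illustration; the script defaults to dp = 1):

# Rank layout implied by dp_rank = rank // (tp * cp * pp).
tp, cp, pp, dp = 8, 1, 1, 2
for rank in range(tp * cp * pp * dp):
    dp_rank = rank // (tp * cp * pp)  # ranks 0-7 -> replica 0, ranks 8-15 -> replica 1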
2 changes: 1 addition & 1 deletion dfm/src/dtensor/README.md → dfm/src/Automodel/README.md
@@ -1,4 +1,4 @@
# DTensor Models
# Automodel

Models implemented using DTensors (NeMo Automodel)

20 changes: 20 additions & 0 deletions dfm/src/Automodel/_diffusers/__init__.py
@@ -0,0 +1,20 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from .auto_diffusion_pipeline import NeMoAutoDiffusionPipeline


__all__ = [
    "NeMoAutoDiffusionPipeline",
]
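The re-export above lets callers import the pipeline from the subpackage directly; both forms below resolve to the same class (a usage sketch):

from Automodel._diffusers import NeMoAutoDiffusionPipeline
# ...equivalent to the full module path used in wan_generate.py above:
# from Automodel._diffusers.auto_diffusion_pipeline import NeMoAutoDiffusionPipeline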