NVIDIA-NeMo
diff --git a/‎dfm/src/automodel/datasets/__init__.py‎
Lines changed: 8 additions & 0 deletions b/‎dfm/src/automodel/datasets/__init__.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎dfm/src/automodel/datasets/mock_dataloader.py‎
Lines changed: 194 additions & 0 deletions b/‎dfm/src/automodel/datasets/mock_dataloader.py‎
Lines changed: 194 additions & 0 deletions
diff --git a/‎dfm/src/automodel/flow_matching/adapters/base.py‎
Lines changed: 0 additions & 26 deletions b/‎dfm/src/automodel/flow_matching/adapters/base.py‎
Lines changed: 0 additions & 26 deletions
diff --git a/‎dfm/src/automodel/flow_matching/adapters/hunyuan.py‎
Lines changed: 23 additions & 0 deletions b/‎dfm/src/automodel/flow_matching/adapters/hunyuan.py‎
Lines changed: 23 additions & 0 deletions
diff --git a/‎dfm/src/automodel/recipes/train.py‎
Lines changed: 1 addition & 3 deletions b/‎dfm/src/automodel/recipes/train.py‎
Lines changed: 1 addition & 3 deletions
diff --git a/‎examples/automodel/pretrain/pretrain.py‎
Lines changed: 2 additions & 2 deletions b/‎examples/automodel/pretrain/pretrain.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎examples/automodel/pretrain/wan2_1_t2v_flow.yaml‎
Lines changed: 2 additions & 0 deletions b/‎examples/automodel/pretrain/wan2_1_t2v_flow.yaml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎tests/functional_tests/L2_Automodel_Wan21_Test.sh‎
Lines changed: 1 addition & 1 deletion b/‎tests/functional_tests/L2_Automodel_Wan21_Test.sh‎
Lines changed: 1 addition & 1 deletion
@@ -19,6 +19,11 @@
     collate_fn,
     create_dataloader,
 )
+from dfm.src.automodel.datasets.mock_dataloader import (
+    MockWanDataset,
+    build_mock_dataloader,
+    mock_collate_fn,
+)
 
 
 __all__ = [
@@ -27,4 +32,7 @@
     "build_node_parallel_sampler",
     "collate_fn",
     "create_dataloader",
+    "MockWanDataset",
+    "build_mock_dataloader",
+    "mock_collate_fn",
 ]
@@ -0,0 +1,194 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Mock dataloader for automodel WAN training tests.
+
+This module provides a mock dataset and dataloader that generates random
+tensors with the correct shapes for WAN 2.1 training, allowing functional
+tests to run without requiring real data.
+"""
+
+from typing import Dict, Optional, Tuple
+
+import torch
+from torch.utils.data import DataLoader, Dataset, DistributedSampler
+
+
+class MockWanDataset(Dataset):
+    """Mock dataset that generates random data matching WAN 2.1 expected format.
+
+    Args:
+        length: Number of samples in the dataset.
+        num_channels: Number of latent channels (default: 16 for WAN).
+        num_frame_latents: Number of temporal latent frames.
+        spatial_h: Height of spatial latents.
+        spatial_w: Width of spatial latents.
+        text_seq_len: Length of text sequence.
+        text_embed_dim: Dimension of text embeddings (default: 4096 for UMT5).
+        device: Device to place tensors on.
+    """
+
+    def __init__(
+        self,
+        length: int = 1024,
+        num_channels: int = 16,
+        num_frame_latents: int = 16,
+        spatial_h: int = 30,
+        spatial_w: int = 52,
+        text_seq_len: int = 77,
+        text_embed_dim: int = 4096,
+        device: str = "cpu",
+    ) -> None:
+        self.length = max(int(length), 1)
+        self.num_channels = num_channels
+        self.num_frame_latents = num_frame_latents
+        self.spatial_h = spatial_h
+        self.spatial_w = spatial_w
+        self.text_seq_len = text_seq_len
+        self.text_embed_dim = text_embed_dim
+        self.device = device
+
+    def __len__(self) -> int:
+        return self.length
+
+    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
+        """Generate a mock sample with random data.
+
+        Returns:
+            Dict containing:
+                - text_embeddings: [1, text_seq_len, text_embed_dim]
+                - video_latents: [1, num_channels, num_frame_latents, spatial_h, spatial_w]
+                - metadata: empty dict
+                - file_info: mock file info
+        """
+        # Generate random video latents: (1, C, T, H, W)
+        video_latents = torch.randn(
+            1,
+            self.num_channels,
+            self.num_frame_latents,
+            self.spatial_h,
+            self.spatial_w,
+            dtype=torch.float32,
+            device=self.device,
+        )
+
+        # Generate random text embeddings: (1, seq_len, embed_dim)
+        text_embeddings = torch.randn(
+            1,
+            self.text_seq_len,
+            self.text_embed_dim,
+            dtype=torch.float32,
+            device=self.device,
+        )
+
+        return {
+            "text_embeddings": text_embeddings,
+            "video_latents": video_latents,
+            "metadata": {},
+            "file_info": {
+                "meta_filename": f"mock_sample_{idx}.meta",
+                "original_filename": f"mock_video_{idx}.mp4",
+                "original_video_path": f"/mock/path/video_{idx}.mp4",
+                "deterministic_latents": True,
+                "memory_optimization": False,
+                "num_frames": self.num_frame_latents * 4,  # Approximate original frames
+            },
+        }
+
+
+def mock_collate_fn(batch):
+    """Collate function for mock dataset, matching the real collate_fn behavior."""
+    text_embeddings = torch.cat([item["text_embeddings"] for item in batch], dim=0)
+    video_latents = torch.cat([item["video_latents"] for item in batch], dim=0)
+
+    return {
+        "text_embeddings": text_embeddings,
+        "video_latents": video_latents,
+        "metadata": [item["metadata"] for item in batch],
+        "file_info": [item["file_info"] for item in batch],
+    }
+
+
+def build_mock_dataloader(
+    *,
+    dp_rank: int = 0,
+    dp_world_size: int = 1,
+    batch_size: int = 1,
+    num_workers: int = 0,
+    device: str = "cpu",
+    length: int = 1024,
+    num_channels: int = 16,
+    num_frame_latents: int = 16,
+    spatial_h: int = 30,
+    spatial_w: int = 52,
+    text_seq_len: int = 77,
+    text_embed_dim: int = 4096,
+    shuffle: bool = True,
+) -> Tuple[DataLoader, Optional[DistributedSampler]]:
+    """Build a mock dataloader for WAN training tests.
+
+    This function follows the same interface as build_dataloader but generates
+    random data instead of loading from .meta files.
+
+    Args:
+        dp_rank: Data parallel rank.
+        dp_world_size: Data parallel world size.
+        batch_size: Batch size per GPU.
+        num_workers: Number of dataloader workers.
+        device: Device to place tensors on.
+        length: Number of samples in mock dataset.
+        num_channels: Number of latent channels (default: 16).
+        num_frame_latents: Number of temporal latent frames.
+        spatial_h: Height of spatial latents.
+        spatial_w: Width of spatial latents.
+        text_seq_len: Length of text sequence.
+        text_embed_dim: Dimension of text embeddings.
+        shuffle: Whether to shuffle data.
+
+    Returns:
+        Tuple of (DataLoader, DistributedSampler or None).
+    """
+    dataset = MockWanDataset(
+        length=length,
+        num_channels=num_channels,
+        num_frame_latents=num_frame_latents,
+        spatial_h=spatial_h,
+        spatial_w=spatial_w,
+        text_seq_len=text_seq_len,
+        text_embed_dim=text_embed_dim,
+        device=device,
+    )
+
+    sampler = None
+    if dp_world_size > 1:
+        sampler = DistributedSampler(
+            dataset,
+            num_replicas=dp_world_size,
+            rank=dp_rank,
+            shuffle=shuffle,
+            drop_last=False,
+        )
+
+    use_pin_memory = device == "cpu"
+    dataloader = DataLoader(
+        dataset,
+        batch_size=batch_size,
+        shuffle=(sampler is None and shuffle),
+        sampler=sampler,
+        num_workers=num_workers,
+        collate_fn=mock_collate_fn,
+        pin_memory=use_pin_memory,
+    )
+
+    return dataloader, sampler
@@ -117,32 +117,6 @@ def forward(self, model: nn.Module, inputs: Dict[str, Any]) -> torch.Tensor:
         """
         pass
 
-    def get_condition_latents(self, latents: torch.Tensor, task_type: str) -> torch.Tensor:
-        """
-        Generate conditional latents based on task type.
-
-        Override this method if your model uses a different conditioning scheme.
-        Default implementation adds a channel for conditioning mask.
-
-        Args:
-            latents: Input latents [B, C, F, H, W]
-            task_type: Task type ("t2v" or "i2v")
-
-        Returns:
-            Conditional latents [B, C+1, F, H, W]
-        """
-        b, c, f, h, w = latents.shape
-        cond = torch.zeros([b, c + 1, f, h, w], device=latents.device, dtype=latents.dtype)
-
-        if task_type == "t2v":
-            return cond
-        elif task_type == "i2v":
-            cond[:, :-1, :1] = latents[:, :, :1]
-            cond[:, -1, 0] = 1
-            return cond
-        else:
-            raise ValueError(f"Unsupported task type: {task_type}")
-
     def post_process_prediction(self, model_pred: torch.Tensor) -> torch.Tensor:
         """
         Post-process model prediction if needed.
 
@@ -65,6 +65,29 @@ def __init__(
         self.default_image_embed_shape = default_image_embed_shape
         self.use_condition_latents = use_condition_latents
 
+    def get_condition_latents(self, latents: torch.Tensor, task_type: str) -> torch.Tensor:
+        """
+        Generate conditional latents based on task type.
+
+        Args:
+            latents: Input latents [B, C, F, H, W]
+            task_type: Task type ("t2v" or "i2v")
+
+        Returns:
+            Conditional latents [B, C+1, F, H, W]
+        """
+        b, c, f, h, w = latents.shape
+        cond = torch.zeros([b, c + 1, f, h, w], device=latents.device, dtype=latents.dtype)
+
+        if task_type == "t2v":
+            return cond
+        elif task_type == "i2v":
+            cond[:, :-1, :1] = latents[:, :, :1]
+            cond[:, -1, 0] = 1
+            return cond
+        else:
+            raise ValueError(f"Unsupported task type: {task_type}")
+
     def prepare_inputs(self, context: FlowMatchingContext) -> Dict[str, Any]:
         """
         Prepare inputs for HunyuanVideo model.
 
@@ -171,7 +171,7 @@ def setup(self):
         self.rng = StatefulRNG(seed=self.seed, ranked=True)
 
         self.model_id = self.cfg.get("model.pretrained_model_name_or_path")
-        self.attention_backend = self.cfg.get("model.attention_backend", "_flash_3_hub")
+        self.attention_backend = self.cfg.get("model.attention_backend")
         self.learning_rate = self.cfg.get("optim.learning_rate", 5e-6)
         self.bf16 = torch.bfloat16
 
@@ -250,8 +250,6 @@ def setup(self):
             raise ValueError(
                 "checkpoint config is required in YAML (enabled, checkpoint_dir, model_save_format, save_consolidated)"
             )
-        if not checkpoint_cfg.get("enabled", False):
-            raise ValueError("checkpoint.enabled must be true in YAML for diffusion training")
 
         # Build BaseRecipe-style checkpointing configuration (DCP/TORCH_SAVE) from YAML
         model_state_dict_keys = list(self.model.state_dict().keys())
 
@@ -16,12 +16,12 @@
 
 from nemo_automodel.components.config._arg_parser import parse_args_and_load_config
 
-from dfm.src.automodel.recipes.train import TrainWan21DiffusionRecipe
+from dfm.src.automodel.recipes.train import TrainDiffusionRecipe
 
 
 def main(default_config_path="examples/automodel/pretrain/wan2_1_t2v_flow.yaml"):
     cfg = parse_args_and_load_config(default_config_path)
-    recipe = TrainWan21DiffusionRecipe(cfg)
+    recipe = TrainDiffusionRecipe(cfg)
     recipe.setup()
     recipe.run_train_validation_loop()
 
 
@@ -37,6 +37,8 @@ optim:
 
 
 flow_matching:
+  adapter_type: "simple"  # Options: "hunyuan", "simple"
+  adapter_kwargs: {}
   use_sigma_noise: true
   timestep_sampling: uniform
   logit_mean: 0.0
 
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-CUDA_VISIBLE_DEVICES="0,1" uv run coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/automodel/wan21 -m "not pleasefixme" --with_downloads
+CUDA_VISIBLE_DEVICES="0,1" uv run --group automodel coverage run -a --data-file=/opt/DFM/.coverage --source=/opt/DFM/ -m pytest tests/functional_tests/automodel/wan21 -m "not pleasefixme" --with_downloads