feat: fix MMDiT encoder/decoder block-list setup and load the model checkpoint from a wandb artifact when resuming training

AshishKumar4 · AshishKumar4 · commit 714939f63711 · 2025-05-04T16:53:25.000Z
diff --git a/flaxdiff/models/simple_mmdit.py b/flaxdiff/models/simple_mmdit.py
@@ -614,7 +614,7 @@ def setup(self):
         ]
         
         # Encoder blocks (from coarse to fine)
-        self.encoder_blocks = []
+        encoder_blocks = []
         for stage in range(num_stages):
             stage_blocks = [
                 MMDiTBlock(
@@ -632,7 +632,9 @@ def setup(self):
                 ) 
                 for i in range(self.num_layers[stage] // 2)  # Half for encoder, half for decoder
             ]
-            self.encoder_blocks.append(stage_blocks)
+            encoder_blocks.append(stage_blocks)
+            
+        self.encoder_blocks = encoder_blocks
         
         # Patch expanding layers (from coarse to fine)
         if num_stages > 1:
@@ -647,7 +649,7 @@ def setup(self):
             ]
         
         # Decoder blocks (from coarse to fine)
-        self.decoder_blocks = []
+        decoder_blocks = []
         for stage in range(num_stages-1, -1, -1):
             stage_blocks = [
                 MMDiTBlock(
@@ -665,7 +667,8 @@ def setup(self):
                 ) 
                 for i in range(self.num_layers[stage] // 2)  # Half for encoder, half for decoder
             ]
-            self.decoder_blocks.append(stage_blocks)
+            decoder_blocks.append(stage_blocks)
+        self.decoder_blocks = decoder_blocks
             
         # Fusion layers for skip connections
         if num_stages > 1:
diff --git a/flaxdiff/trainer/simple_trainer.py b/flaxdiff/trainer/simple_trainer.py
@@ -26,6 +26,7 @@
 from flaxdiff.utils import RandomMarkovState
 from flax.training import dynamic_scale as dynamic_scale_lib
 from dataclasses import dataclass
+import shutil
 import gc
 
 PROCESS_COLOR_MAP = {
@@ -73,6 +74,54 @@ class SimpleTrainState(train_state.TrainState):
     metrics: Metrics
     dynamic_scale: dynamic_scale_lib.DynamicScale
 
def move_contents_to_subdir(target_dir, new_subdir_name):
    """Move every entry of ``target_dir`` into a subdirectory of it.

    The subdirectory ``os.path.join(target_dir, new_subdir_name)`` is created
    if it does not exist, then each entry found directly inside
    ``target_dir`` is moved into it. The entry that is, or that contains,
    the destination directory is left in place so a directory is never moved
    into itself.

    The operation is best-effort: problems are reported with ``print`` and
    never raised. A failure to move one entry does not stop the remaining
    entries from being processed.

    Args:
        target_dir: Directory whose contents are relocated.
        new_subdir_name: Name of the destination, joined onto ``target_dir``
            with ``os.path.join``. It may contain several path components
            (e.g. ``"ckpt/7"``); an absolute path replaces ``target_dir``
            entirely, which is how ``os.path.join`` treats absolute parts.

    Returns:
        None.
    """
    # --- 1. Validate target directory ---
    if not os.path.isdir(target_dir):
        print(f"Error: Target directory '{target_dir}' not found or is not a directory.")
        return

    # --- 2. Create the destination subdirectory ---
    new_subdir_path = os.path.join(target_dir, new_subdir_name)
    try:
        # exist_ok=True makes a repeated call on the same directory harmless.
        os.makedirs(new_subdir_path, exist_ok=True)
    except OSError as e:
        print(f"Error creating subdirectory '{new_subdir_path}': {e}")
        return
    print(f"Subdirectory '{new_subdir_path}' created or already exists.")

    # --- 3. Snapshot the entries to move ---
    try:
        items_to_move = os.listdir(target_dir)
    except OSError as e:
        print(f"Error listing contents of '{target_dir}': {e}")
        return

    # --- 4. Move the entries ---
    print(f"Moving items from '{target_dir}' to '{new_subdir_path}'...")
    # Compare normalised absolute paths: raw string comparison breaks on
    # trailing separators, "./" prefixes and multi-component subdir names.
    destination_abs = os.path.abspath(new_subdir_path)
    moved_count = 0
    error_count = 0
    for item_name in items_to_move:
        source_path = os.path.join(target_dir, item_name)
        source_abs = os.path.abspath(source_path)
        # Skip the destination itself and any entry that contains it;
        # moving either would mean moving a directory into itself.
        if destination_abs == source_abs or destination_abs.startswith(source_abs + os.sep):
            continue
        destination_path = os.path.join(new_subdir_path, item_name)
        try:
            shutil.move(source_path, destination_path)
        except OSError as e:
            # shutil.Error is a subclass of OSError, so this covers both.
            print(f"  Error moving '{item_name}': {e}")
            error_count += 1
        else:
            moved_count += 1

    print("\nOperation complete.")
    print(f"  Successfully moved: {moved_count} item(s).")
    if error_count > 0:
        print(f"  Errors encountered: {error_count} item(s).")
+
 @dataclass
 class SimpleTrainer:
     state: SimpleTrainState
@@ -124,6 +173,17 @@ def __init__(self,
                 if train_start_step_override is None:
                     train_start_step_override = run.summary['train/step'] + 1
                 print(f"Resuming from previous run {wandb_config['id']} with start step {train_start_step_override}")
+                
+                # If load_from_checkpoint is not set, and an artifact is found, load the artifact
+                if load_from_checkpoint is None:
+                    model_artifacts = [i for i in run.logged_artifacts() if i.type == 'model']
+                    if model_artifacts:
+                        artifact = model_artifacts[0]
+                        artifact_dir = artifact.download()
+                        print(f"Loading model from artifact {artifact.name} at {artifact_dir}")
+                        # Move the artifact's contents
+                        load_from_checkpoint = os.path.join(artifact_dir, str(run.summary['train/step']))
+                        move_contents_to_subdir(artifact_dir, load_from_checkpoint)
             
             # define our custom x axis metric
             self.wandb.define_metric("train/step")
@@ -272,6 +332,7 @@ def load(self, checkpoint_path=None, checkpoint_step=None):
             f"{step}")
         self.loaded_checkpoint_path = loaded_checkpoint_path
         ckpt = checkpointer.restore(step)
+        
         state = ckpt['state']
         best_state = ckpt['best_state']
         rngstate = ckpt['rngs']
@@ -590,5 +651,5 @@ def fit(self, data, train_steps_per_epoch, epochs, train_step_args={}, val_steps
                 )
                 print(colored(f"Validation done on process index {process_index}", PROCESS_COLOR_MAP[process_index]))
                 
-        self.save(epochs)
+        self.save(epochs)#
         return self.state

Original file line number	Diff line number	Diff line change
`@@ -614,7 +614,7 @@ def setup(self):`
`614`	`614`	`]`
`615`	`615`
`616`	`616`	`# Encoder blocks (from coarse to fine)`
`617`		`- self.encoder_blocks = []`
	`617`	`+ encoder_blocks = []`
`618`	`618`	`for stage in range(num_stages):`
`619`	`619`	`stage_blocks = [`
`620`	`620`	`MMDiTBlock(`
`@@ -632,7 +632,9 @@ def setup(self):`
`632`	`632`	`)`
`633`	`633`	`for i in range(self.num_layers[stage] // 2) # Half for encoder, half for decoder`
`634`	`634`	`]`
`635`		`- self.encoder_blocks.append(stage_blocks)`
	`635`	`+ encoder_blocks.append(stage_blocks)`
	`636`	`+`
	`637`	`+ self.encoder_blocks = encoder_blocks`
`636`	`638`
`637`	`639`	`# Patch expanding layers (from coarse to fine)`
`638`	`640`	`if num_stages > 1:`
`@@ -647,7 +649,7 @@ def setup(self):`
`647`	`649`	`]`
`648`	`650`
`649`	`651`	`# Decoder blocks (from coarse to fine)`
`650`		`- self.decoder_blocks = []`
	`652`	`+ decoder_blocks = []`
`651`	`653`	`for stage in range(num_stages-1, -1, -1):`
`652`	`654`	`stage_blocks = [`
`653`	`655`	`MMDiTBlock(`
`@@ -665,7 +667,8 @@ def setup(self):`
`665`	`667`	`)`
`666`	`668`	`for i in range(self.num_layers[stage] // 2) # Half for encoder, half for decoder`
`667`	`669`	`]`
`668`		`- self.decoder_blocks.append(stage_blocks)`
	`670`	`+ decoder_blocks.append(stage_blocks)`
	`671`	`+ self.decoder_blocks = decoder_blocks`
`669`	`672`
`670`	`673`	`# Fusion layers for skip connections`
`671`	`674`	`if num_stages > 1:`