
Commit b7abbdb

feat: general changes
1 parent: 0d26ac3

File tree

10 files changed: +879 -964 lines


README.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -1,5 +1,7 @@
 # ![](images/logo.jpeg "FlaxDiff")
 
+**This project is being used for the UMD Course project MSML 605: MLOps**
+
 **This project is partially supported by [Google TPU Research Cloud](https://sites.research.google/trc/about/). I would like to thank the Google Cloud TPU team for providing me with the resources to train the bigger text-conditional models in multi-host distributed settings.**
 
 ## A Versatile and simple Diffusion Library
```

flaxdiff/data/dataloaders.py

Lines changed: 31 additions & 2 deletions
```diff
@@ -291,14 +291,22 @@ def get_dataset_grain(
 
     local_batch_size = batch_size // jax.process_count()
 
-    sampler = pygrain.IndexSampler(
+    train_sampler = pygrain.IndexSampler(
         num_records=len(data_source) if count is None else count,
         shuffle=True,
         seed=seed,
         num_epochs=num_epochs,
         shard_options=pygrain.ShardByJaxProcess(),
     )
 
+    # val_sampler = pygrain.IndexSampler(
+    #     num_records=len(data_source) if count is None else count,
+    #     shuffle=False,
+    #     seed=seed,
+    #     num_epochs=num_epochs,
+    #     shard_options=pygrain.ShardByJaxProcess(),
+    # )
+
     def get_trainset():
         transformations = [
             augmenter(),
@@ -307,7 +315,7 @@ def get_trainset():
 
         loader = pygrain.DataLoader(
             data_source=data_source,
-            sampler=sampler,
+            sampler=train_sampler,
             operations=transformations,
             worker_count=worker_count,
             read_options=pygrain.ReadOptions(
@@ -316,10 +324,31 @@ def get_trainset():
             worker_buffer_size=worker_buffer_size,
         )
         return loader
+
+    # def get_valset():
+    #     transformations = [
+    #         augmenter(),
+    #         pygrain.Batch(local_batch_size, drop_remainder=True),
+    #     ]
+
+    #     loader = pygrain.DataLoader(
+    #         data_source=data_source,
+    #         sampler=val_sampler,
+    #         operations=transformations,
+    #         worker_count=worker_count,
+    #         read_options=pygrain.ReadOptions(
+    #             read_thread_count, read_buffer_size
+    #         ),
+    #         worker_buffer_size=worker_buffer_size,
+    #     )
+    #     return loader
+    get_valset = get_trainset  # For now, use the same function for validation
 
     return {
         "train": get_trainset,
         "train_len": len(data_source),
+        "val": get_valset,
+        "val_len": len(data_source),
         "local_batch_size": local_batch_size,
         "global_batch_size": batch_size,
     }
```
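Note that `get_valset` is currently just an alias for `get_trainset`, so the new `"val"` entry replays shuffled training data rather than a held-out split. A minimal sketch of consuming the returned dict, using only the keys visible in this diff (the `data` value is assumed to come from a prior `get_dataset_grain` call with the caller's own arguments):

```python
# Sketch: consuming the dict returned by get_dataset_grain.
def epoch_iterators(data: dict):
    # Each split entry is a factory that builds a pygrain.DataLoader.
    train_iter = iter(data["train"]())
    val_iter = iter(data["val"]())  # currently the same loader factory as "train"
    # Derive steps per epoch from the dataset length and the global batch size.
    steps_per_epoch = data["train_len"] // data["global_batch_size"]
    return train_iter, val_iter, steps_per_epoch
```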

flaxdiff/data/dataset_map.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -21,7 +21,7 @@
         "augmenter": gcs_augmenters,
     },
     "laiona_coco": {
-        "source": data_source_gcs('arrayrecord2/laion-aesthetics-12m+mscoco-2017'),
+        "source": data_source_gcs('datasets/laion12m+mscoco'),
         "augmenter": gcs_augmenters,
     },
     "aesthetic_coyo": {
```
flaxdiff/data/sources/images.py

Lines changed: 17 additions & 0 deletions
```diff
@@ -167,6 +167,16 @@ def map(self, element) -> Dict[str, jnp.array]:
 
         return TFDSTransform
 
+"""
+Batch structure:
+{
+    "image": image_batch,
+    "text": {
+        "input_ids": input_ids_batch,
+        "attention_mask": attention_mask_batch,
+    }
+}
+"""
 
 # ----------------------------------------------------------------------------------
 # GCS Image Source
@@ -248,6 +258,13 @@ def create_transform(self, image_scale: int = 256, method: Any = None) -> Callable:
             A callable that returns a pygrain.MapTransform.
         """
         labelizer = self.labelizer
+        if method is None:
+            if image_scale > 256:
+                method = cv2.INTER_CUBIC
+            else:
+                method = cv2.INTER_AREA
+
+        print(f"Using method: {method}")
 
         class GCSTransform(pygrain.MapTransform):
             def __init__(self, *args, **kwargs):
```
flaxdiff/trainer/general_diffusion_trainer.py

Lines changed: 74 additions & 23 deletions
```diff
@@ -18,13 +18,13 @@
 from flaxdiff.utils import RandomMarkovState, serialize_model, get_latest_checkpoint
 from flaxdiff.inputs import ConditioningEncoder, ConditionalInputConfig, DiffusionInputConfig
 
-from .simple_trainer import SimpleTrainer, SimpleTrainState, Metrics
+from .simple_trainer import SimpleTrainer, SimpleTrainState, Metrics, convert_to_global_tree
 
 from flaxdiff.models.autoencoder.autoencoder import AutoEncoder
 from flax.training import dynamic_scale as dynamic_scale_lib
 
 # Reuse the TrainState from the DiffusionTrainer
-from flaxdiff.trainer.diffusion_trainer import TrainState, DiffusionTrainer
+from .diffusion_trainer import TrainState, DiffusionTrainer
 import shutil
 
 def generate_modelname(
@@ -103,6 +103,15 @@ def generate_modelname(
     # model_name = f"{model_name}-{config_hash}"
     return model_name
 
+@dataclass
+class EvaluationMetric:
+    """
+    Evaluation metrics for the diffusion model.
+    The function is given generated samples batch [B, H, W, C] and the original batch.
+    """
+    function: Callable
+    name: str
+
 class GeneralDiffusionTrainer(DiffusionTrainer):
     """
     General trainer for diffusion models supporting both images and videos.
@@ -126,6 +135,7 @@ def __init__(self,
                  native_resolution: int = None,
                  frames_per_sample: int = None,
                  wandb_config: Dict[str, Any] = None,
+                 eval_metrics: List[EvaluationMetric] = None,
                  **kwargs
                  ):
         """
@@ -150,6 +160,7 @@
             autoencoder=autoencoder,
         )
         self.input_config = input_config
+        self.eval_metrics = eval_metrics
 
         if wandb_config is not None:
             # If input_config is not in wandb_config, add it
@@ -363,7 +374,6 @@ def _define_validation_step(self, sampler_class: Type[DiffusionSampler]=DDIMSampler
         def generate_samples(
             val_state: TrainState,
             batch,
-            sampler: DiffusionSampler,
             diffusion_steps: int,
         ):
             # Process all conditional inputs
@@ -385,7 +395,7 @@
                 model_conditioning_inputs=tuple(model_conditioning_inputs),
             )
 
-        return sampler, generate_samples
+        return generate_samples
 
     def _get_image_size(self):
         """Helper to determine image size from available information."""
@@ -415,32 +425,73 @@ def validation_loop(
         Run validation and log samples for both image and video diffusion.
         """
 
-        sampler, generate_samples = val_step_fn
-        val_ds = iter(val_ds()) if val_ds else None
+        global_device_count = jax.device_count()
+        local_device_count = jax.local_device_count()
+        process_index = jax.process_index()
+        generate_samples = val_step_fn
 
+        val_ds = iter(val_ds()) if val_ds else None
+        # Evaluation step
         try:
-            # Generate samples
-            samples = generate_samples(
-                val_state,
-                next(val_ds),
-                sampler,
-                diffusion_steps,
-            )
-
-            # Log samples to wandb
-            if getattr(self, 'wandb', None) is not None and self.wandb:
-                import numpy as np
+            metrics = {metric.name: [] for metric in self.eval_metrics} if self.eval_metrics else {}
+            for i in range(val_steps_per_epoch):
+                if val_ds is None:
+                    batch = None
+                else:
+                    batch = next(val_ds)
+                    if self.distributed_training and global_device_count > 1:
+                        batch = convert_to_global_tree(self.mesh, batch)
+                # Generate samples
+                samples = generate_samples(
+                    val_state,
+                    batch,
+                    diffusion_steps,
+                )
 
-                # Process samples differently based on dimensionality
-                if len(samples.shape) == 5:  # [B,T,H,W,C] - Video data
-                    self._log_video_samples(samples, current_step)
-                else:  # [B,H,W,C] - Image data
-                    self._log_image_samples(samples, current_step)
+                if self.eval_metrics is not None:
+                    for metric in self.eval_metrics:
+                        try:
+                            # Evaluate metrics
+                            metric_val = metric.function(samples, batch)
+                            metrics[metric.name].append(metric_val)
+                        except Exception as e:
+                            print("Error in evaluation metrics:", e)
+                            import traceback
+                            traceback.print_exc()
+                            pass
 
+                if i == 0:
+                    print(f"Evaluation started for process index {process_index}")
+                    # Log samples to wandb
+                    if getattr(self, 'wandb', None) is not None and self.wandb:
+                        import numpy as np
+
+                        # Process samples differently based on dimensionality
+                        if len(samples.shape) == 5:  # [B,T,H,W,C] - Video data
+                            self._log_video_samples(samples, current_step)
+                        else:  # [B,H,W,C] - Image data
+                            self._log_image_samples(samples, current_step)
+
+            if getattr(self, 'wandb', None) is not None and self.wandb:
+                # metrics is a dict of metrics
+                if metrics and type(metrics) == dict:
+                    # Flatten the metrics
+                    metrics = {k: np.mean(v) for k, v in metrics.items()}
+                    # Log the metrics
+                    for key, value in metrics.items():
+                        if isinstance(value, jnp.ndarray):
+                            value = np.array(value)
+                        self.wandb.log({
+                            f"val/{key}": value,
+                        }, step=current_step)
+
+        except StopIteration:
+            print(f"Validation dataset exhausted for process index {process_index}")
         except Exception as e:
-            print("Error in validation loop:", e)
+            print(f"Error during validation for process index {process_index}: {e}")
             import traceback
             traceback.print_exc()
+
 
     def _log_video_samples(self, samples, current_step):
         """Helper to log video samples to wandb."""
```
flaxdiff/trainer/simple_trainer.py

Lines changed: 7 additions & 5 deletions
```diff
@@ -411,7 +411,9 @@ def train_loop(
         train_ds,
         train_steps_per_epoch,
         current_step,
-        rng_state
+        rng_state,
+        save_every: int = None,
+        val_every=None,
     ):
         global_device_count = jax.device_count()
         process_index = jax.process_index()
@@ -491,8 +493,8 @@
                         "train/loss": loss,
                     }, step=current_step)
                 # Save the model every few steps
-                if i % 10000 == 0 and i > 0:
-                    print(f"Saving model after 10000 step {current_step}")
+                if save_every and i % save_every == 0 and i > 0:
+                    print(f"Saving model after {save_every} step {current_step}")
                     print(f"Devices: {len(jax.devices())}")  # To sync the devices
                     self.save(current_epoch, current_step, train_state, rng_state)
                     print(f"Saving done by process index {process_index}")
@@ -518,7 +520,7 @@ def fit(self, data, train_steps_per_epoch, epochs, train_step_args={}, val_steps
             self.validation_loop(
                 train_state,
                 val_step,
-                data.get('test', data.get('val', None)),
+                data.get('val', data.get('test', None)),
                 val_steps_per_epoch,
                 self.latest_step,
             )
@@ -569,7 +571,7 @@ def fit(self, data, train_steps_per_epoch, epochs, train_step_args={}, val_steps
             self.validation_loop(
                 train_state,
                 val_step,
-                data.get('test', None),
+                data.get('val', data.get('test', None)),
                 val_steps_per_epoch,
                 current_step,
             )
```
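The hard-coded 10000-step checkpoint cadence becomes the `save_every` parameter: passing `save_every=10000` reproduces the old behaviour, and leaving it unset disables periodic saves, since a falsy value short-circuits the check. The gating condition in isolation (a sketch, not repository code):

```python
def should_save(step: int, save_every: int | None) -> bool:
    # Mirrors the diff: never save at step 0, and skip entirely when
    # save_every is None or 0.
    return bool(save_every) and step % save_every == 0 and step > 0

assert should_save(10000, 10000)   # old hard-coded cadence, now opt-in
assert not should_save(0, 10000)   # no save at step 0
assert not should_save(500, None)  # disabled when unset
```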
