
Commit 36d5952

feat: version bump for release

1 parent b7abbdb commit 36d5952

File tree: 8 files changed (+278, −105 lines)


flaxdiff/data/dataloaders.py (1 addition, 1 deletion)

@@ -258,7 +258,7 @@ def get_dataset_grain(
     image_scale=256,
     count=None,
     num_epochs=None,
-    method=jax.image.ResizeMethod.LANCZOS3,
+    method=None,  # jax.image.ResizeMethod.LANCZOS3,
     worker_count=32,
     read_thread_count=64,
     read_buffer_size=50,
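The default resize filter is now None rather than LANCZOS3, so the loader presumably falls back to whatever default the resize path applies downstream. Callers who want the old behavior can still request it explicitly. A minimal sketch, assuming get_dataset_grain takes a dataset name as its first argument (the dataset name and worker count below are illustrative, not part of this commit):

import jax
from flaxdiff.data.dataloaders import get_dataset_grain

# Restore the pre-0.2.4 filter by passing it explicitly.
dataset = get_dataset_grain(
    'laiona_coco',                            # hypothetical dataset argument
    image_scale=256,
    method=jax.image.ResizeMethod.LANCZOS3,   # the old default
    worker_count=32,
)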

flaxdiff/metrics/__init__.py (whitespace-only changes)

flaxdiff/metrics/common.py (11 additions)

@@ -0,0 +1,11 @@
+from typing import Callable
+from dataclasses import dataclass
+
+@dataclass
+class EvaluationMetric:
+    """
+    Evaluation metrics for the diffusion model.
+    The function is given generated samples batch [B, H, W, C] and the original batch.
+    """
+    function: Callable
+    name: str
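Since EvaluationMetric is just a named callable, adding a custom metric only requires a function with the (generated, batch) signature described in the docstring. A minimal sketch with a hypothetical pixel-space MSE metric; the batch['image'] key is an assumption about the batch layout, not something this commit defines:

import jax.numpy as jnp
from flaxdiff.metrics.common import EvaluationMetric

def mse_metric(generated: jnp.ndarray, batch) -> jnp.ndarray:
    # Compare generated samples [B, H, W, C] against the originals.
    # 'image' is an assumed batch key for illustration only.
    return jnp.mean((generated - batch['image']) ** 2)

mse = EvaluationMetric(function=mse_metric, name='mse')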

flaxdiff/metrics/images.py (59 additions)

@@ -0,0 +1,59 @@
+from .common import EvaluationMetric
+import jax
+import jax.numpy as jnp
+
+def get_clip_metric(
+    modelname: str = "openai/clip-vit-large-patch14",
+):
+    from transformers import AutoProcessor, FlaxCLIPModel
+    model = FlaxCLIPModel.from_pretrained(modelname, dtype=jnp.float16)
+    processor = AutoProcessor.from_pretrained(modelname, use_fast=True, dtype=jnp.float16)
+
+    @jax.jit
+    def calc(pixel_values, input_ids, attention_mask):
+        # Run CLIP on the generated images and the original text prompts
+        generated_out = model(
+            pixel_values=pixel_values,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+        )
+
+        gen_img_emb = generated_out.image_embeds
+        txt_emb = generated_out.text_embeds
+
+        # 1. Normalize embeddings (essential for cosine similarity/distance)
+        gen_img_emb = gen_img_emb / (jnp.linalg.norm(gen_img_emb, axis=-1, keepdims=True) + 1e-6)
+        txt_emb = txt_emb / (jnp.linalg.norm(txt_emb, axis=-1, keepdims=True) + 1e-6)
+
+        # 2. Calculate cosine similarity
+        # Using einsum for a batched dot product:
+        # batch (b), embedding_dim (d) -> 'bd,bd->b'
+        similarity = jnp.einsum('bd,bd->b', gen_img_emb, txt_emb)
+
+        # 3. Convert similarity to a distance
+        scaled_distance = (1.0 - similarity)
+        # 4. Average over the batch
+        mean_scaled_distance = jnp.mean(scaled_distance)
+
+        return mean_scaled_distance
+
+    def clip_metric(
+        generated: jnp.ndarray,
+        batch
+    ):
+        original_conditions = batch['text']
+
+        # Convert samples from [-1, 1] to [0, 255] uint8
+        generated = (((generated + 1.0) / 2.0) * 255).astype(jnp.uint8)
+
+        generated_inputs = processor(images=generated, return_tensors="jax", padding=True)
+
+        pixel_values = generated_inputs['pixel_values']
+        input_ids = original_conditions['input_ids']
+        attention_mask = original_conditions['attention_mask']
+
+        return calc(pixel_values, input_ids, attention_mask)
+
+    return EvaluationMetric(
+        function=clip_metric,
+        name='clip_similarity'
+    )
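The returned metric reports the mean of (1 − cosine similarity) between CLIP image and text embeddings, so lower is better. It expects batch['text'] to already hold tokenized prompts (input_ids and attention_mask). A standalone sketch with dummy data standing in for a real validation batch:

import jax.numpy as jnp
from transformers import AutoProcessor
from flaxdiff.metrics.images import get_clip_metric

metric = get_clip_metric()  # loads openai/clip-vit-large-patch14

# Tokenize a prompt the way the data pipeline presumably does upstream.
processor = AutoProcessor.from_pretrained("openai/clip-vit-large-patch14")
text = processor(text=["a photo of a cat"], return_tensors="jax", padding=True)

generated = jnp.zeros((1, 224, 224, 3))  # fake samples in [-1, 1]
batch = {'text': {'input_ids': text['input_ids'],
                  'attention_mask': text['attention_mask']}}

distance = metric.function(generated, batch)
print(metric.name, float(distance))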

flaxdiff/trainer/general_diffusion_trainer.py (2 additions, 9 deletions)

@@ -27,6 +27,8 @@
 from .diffusion_trainer import TrainState, DiffusionTrainer
 import shutil

+from flaxdiff.metrics.common import EvaluationMetric
+
 def generate_modelname(
     dataset_name: str,
     noise_schedule_name: str,
@@ -103,15 +105,6 @@ def generate_modelname(
     # model_name = f"{model_name}-{config_hash}"
     return model_name

-@dataclass
-class EvaluationMetric:
-    """
-    Evaluation metrics for the diffusion model.
-    The function is given generated samples batch [B, H, W, C] and the original batch.
-    """
-    function: Callable
-    name: str
-
 class GeneralDiffusionTrainer(DiffusionTrainer):
     """
     General trainer for diffusion models supporting both images and videos.
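The dataclass itself is unchanged by the move, so any code that previously imported it from the trainer module should now import it from the shared metrics package:

from flaxdiff.metrics.common import EvaluationMetric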

prototype_general_pipeline.ipynb (184 additions, 91 deletions)

Large diffs are not rendered by default.

pyproject.toml (1 addition, 1 deletion)

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "flaxdiff"
-version = "0.2.3"
+version = "0.2.4"
 description = "A versatile and easy to understand Diffusion library"
 readme = "README.md"
 authors = [

training.py (20 additions, 3 deletions)

@@ -96,7 +96,9 @@ def boolean_string(s):
 parser.add_argument('--image_size', type=int, default=128, help='Image size')
 parser.add_argument('--epochs', type=int, default=100, help='Number of epochs')
 parser.add_argument('--steps_per_epoch', type=int,
-                    default=None, help='Steps per epoch')
+                    default=None, help='Training Steps per epoch')
+parser.add_argument('--val_steps_per_epoch', type=int,
+                    default=4, help='Validation Steps per epoch')
 parser.add_argument('--dataset', type=str,
                     default='laiona_coco', help='Dataset to use')
 parser.add_argument('--dataset_path', type=str,
@@ -171,6 +173,8 @@ def boolean_string(s):
 parser.add_argument('--wandb_project', type=str, default='mlops-msml605-project', help='Wandb project name')
 parser.add_argument('--wandb_entity', type=str, default='umd-projects', help='Wandb entity name')

+parser.add_argument('--val_metrics', type=str, nargs='+', default=['clip'], help='Validation metrics to use')
+
 # parser.add_argument('--wandb_project', type=str, default='flaxdiff', help='Wandb project name')
 # parser.add_argument('--wandb_entity', type=str, default='ashishkumar4', help='Wandb entity name')

@@ -373,6 +377,14 @@ def main(args):
         ]
     )

+    eval_metrics = []
+    # Validation metrics
+    if args.val_metrics is not None:
+        if 'clip' in args.val_metrics:
+            from flaxdiff.metrics.images import get_clip_metric
+            print("Using CLIP metric for validation")
+            eval_metrics.append(get_clip_metric())
+
     CONFIG = {
         "model": model_config,
         "architecture": args.architecture,
@@ -410,6 +422,9 @@ def main(args):
     experiment_name = "{name}_{date}".format(
         name="Diffusion_SDE_VE_TEXT", date=datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
     )
+
+    if autoencoder is not None:
+        experiment_name = f"LDM-{experiment_name}"

     conf_args = CONFIG['arguments']
     conf_args['date'] = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
@@ -469,6 +484,7 @@ def main(args):
         use_dynamic_scale=args.use_dynamic_scale,
         native_resolution=IMAGE_SIZE,
         max_checkpoints_to_keep=args.max_checkpoints_to_keep,
+        eval_metrics=eval_metrics,
     )

     if trainer.distributed_training:
@@ -489,11 +505,12 @@ def get_val_dataset(batch_size=8):
     data['test_len'] = len(val_prompts)

     final_state = trainer.fit(
-        data,
-        batches,
+        data,
+        training_steps_per_epoch=batches,
         epochs=CONFIG['epochs'],
         sampler_class=EulerAncestralSampler,
         sampling_noise_schedule=karas_ve_schedule,
+        val_steps_per_epoch=args.val_steps_per_epoch,
     )

 if __name__ == '__main__':
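Together, the new flags let a run request CLIP validation without code changes, e.g. python training.py --val_metrics clip --val_steps_per_epoch 8 (values illustrative). A self-contained sketch of how the flags feed the metric list handed to the trainer, mirroring the hunks above:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--val_metrics', type=str, nargs='+', default=['clip'])
parser.add_argument('--val_steps_per_epoch', type=int, default=4)
args = parser.parse_args(['--val_metrics', 'clip', '--val_steps_per_epoch', '8'])

# Same wiring as main(): the flag selects which metric factories
# populate eval_metrics before they reach the trainer.
eval_metrics = []
if args.val_metrics is not None and 'clip' in args.val_metrics:
    from flaxdiff.metrics.images import get_clip_metric
    eval_metrics.append(get_clip_metric())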
