Skip to content

Commit e02ecd2

Browse files
committed
feat: data preprocessing pipeline
1 parent a73f3e8 commit e02ecd2

File tree

8 files changed

+1350
-252
lines changed

8 files changed

+1350
-252
lines changed

data-processing.py

Lines changed: 480 additions & 0 deletions
Large diffs are not rendered by default.

dataset_preprocessing.ipynb

Lines changed: 777 additions & 245 deletions
Large diffs are not rendered by default.

datasets/gcsfuse.sh

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,6 @@ mkdir -p $MOUNT_PATH
4747
# Grain uses _PROCESS_MANAGEMENT_MAX_THREADS = 64 (https://github.com/google/grain/blob/main/grain/_src/python/grain_pool.py)
4848
# Please make sure max-conns-per-host > grain_worker_count * _PROCESS_MANAGEMENT_MAX_THREADS
4949

50-
gcsfuse -o ro --implicit-dirs --http-client-timeout=5s --max-conns-per-host=0 --max-idle-conns-per-host=100000 \
51-
--experimental-enable-json-read --kernel-list-cache-ttl-secs=-1 -o ro --config-file=$HOME/gcsfuse.yml \
52-
--log-file=$HOME/gcsfuse.json "$DATASET_GCS_BUCKET" "$MOUNT_PATH"
50+
gcsfuse -o rw --implicit-dirs --http-client-timeout=5s --max-conns-per-host=0 --max-idle-conns-per-host=100000 \
51+
--experimental-enable-json-read --kernel-list-cache-ttl-secs=-1 -o rw --config-file=$HOME/gcsfuse.yml \
52+
--log-file=$HOME/gcsfuse.json "$DATASET_GCS_BUCKET" "$MOUNT_PATH"

flaxdiff/data/dataloaders.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,12 @@ def fallback_batch(batch, media_type="image"):
251251
else: # Default to image
252252
return image_collate
253253

254+
class CaptionDeletionTransform(pygrain.MapTransform):
    """Strips the raw "caption" string from an element.

    Applied after any caption-based filtering so that variable-length
    strings do not reach the batching stage.
    """

    def map(self, element):
        # pop() with a default removes the key when present and is a
        # no-op otherwise — same effect as the guarded `del`.
        element.pop("caption", None)
        return element
254260

255261
def get_dataset_grain(
256262
data_name="cc12m",
@@ -288,6 +294,7 @@ def get_dataset_grain(
288294
dataset = datasetMap[data_name]
289295
data_source = dataset["source"](dataset_source)
290296
augmenter = dataset["augmenter"](image_scale, method)
297+
filters = dataset.get("filter", None)(image_scale)
291298

292299
local_batch_size = batch_size // jax.process_count()
293300

@@ -310,8 +317,14 @@ def get_dataset_grain(
310317
def get_trainset():
311318
transformations = [
312319
augmenter(),
313-
pygrain.Batch(local_batch_size, drop_remainder=True),
314320
]
321+
322+
if filters:
323+
print("Adding filters to transformations")
324+
transformations.append(filters())
325+
326+
transformations.append(CaptionDeletionTransform())
327+
transformations.append(pygrain.Batch(local_batch_size, drop_remainder=True))
315328

316329
loader = pygrain.DataLoader(
317330
data_source=data_source,

flaxdiff/data/dataset_map.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
# ---------------------------------------------------------------------------------
99

1010
from .sources.images import data_source_tfds, tfds_augmenters, data_source_gcs
11-
from .sources.images import data_source_combined_gcs, gcs_augmenters
11+
from .sources.images import data_source_combined_gcs, gcs_augmenters, gcs_filters
1212

1313
# Configure the following for your datasets
1414
datasetMap = {
@@ -23,6 +23,7 @@
2323
"laiona_coco": {
2424
"source": data_source_gcs('datasets/laion12m+mscoco'),
2525
"augmenter": gcs_augmenters,
26+
"filter": gcs_filters,
2627
},
2728
"aesthetic_coyo": {
2829
"source": data_source_gcs('arrayrecords/aestheticCoyo_0.25clip_6aesthetic'),

flaxdiff/data/sources/base.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,18 @@ def create_transform(self, **kwargs) -> Callable[[], pygrain.MapTransform]:
6262
"""
6363
pass
6464

65+
@abstractmethod
def create_filter(self, **kwargs) -> Callable[[], pygrain.FilterTransform]:
    """Create a filter function for the data.

    Implementations return the FilterTransform *class* (not an instance);
    the caller instantiates it when assembling the loader pipeline.

    Args:
        **kwargs: Additional arguments for the filter
            (e.g. ``image_scale`` — see the GCS implementation).

    Returns:
        A callable that returns a pygrain.FilterTransform instance.
    """
    pass
76+
6577
@staticmethod
6678
def create(augmenter_type: str, **kwargs) -> 'DataAugmenter':
6779
"""Factory method to create a data augmenter of the specified type.

flaxdiff/data/sources/images.py

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,11 @@ def map(self, element) -> Dict[str, jnp.array]:
168168
}
169169

170170
return TFDSTransform
171-
171+
172+
def create_filter(self, image_scale: int = 256):
    """Create a pass-through filter for TFDS datasets.

    Fixes two defects in the original: pygrain filter transforms must
    implement ``filter`` (the original defined ``map``, which grain never
    calls on a FilterTransform), and the class must be returned so callers
    can instantiate it (the original implicitly returned None).

    Args:
        image_scale: Unused here; kept for interface parity with the
            other augmenters' create_filter methods.

    Returns:
        A pygrain.FilterTransform subclass that keeps every element.
    """
    class FilterTransform(pygrain.FilterTransform):
        def filter(self, element) -> bool:
            # No filtering implemented for TFDS sources — keep everything.
            return True
    return FilterTransform
172176
"""
173177
Batch structure:
174178
{
@@ -237,7 +241,6 @@ def get_source(self, path_override: str = "/home/mrwhite0racle/gcs_mount") -> An
237241
records_path) if 'array_record' in i]
238242
return pygrain.ArrayRecordDataSource(records)
239243

240-
241244
class ImageGCSAugmenter(DataAugmenter):
242245
"""Augmenter for GCS image datasets."""
243246

@@ -290,13 +293,60 @@ def map(self, element) -> Dict[str, jnp.array]:
290293
results = self.auto_tokenize(caption)
291294
return {
292295
"image": image,
296+
"caption": caption,
293297
"text": {
294298
"input_ids": results['input_ids'][0],
295299
"attention_mask": results['attention_mask'][0],
296300
}
297301
}
298302

299303
return GCSTransform
304+
305+
def create_filter(self, image_scale: int = 256, similarity_threshold: float = 0.25):
    """Create a CLIP image/caption similarity filter for GCS elements.

    The returned transform keeps an element only when the CLIP cosine
    similarity between its image and its caption reaches the threshold.
    Requires the augmenter's transform to have put both ``image`` and
    ``caption`` keys on the element.

    Improvements over the original: commented-out dead code removed,
    unused ``transformers`` imports dropped, embeddings computed under
    ``torch.no_grad()`` (the filter is inference-only; without it autograd
    state is built for every element), the 0.25 cut-off is now a
    parameter (default unchanged), and a plain ``bool`` is returned
    instead of a 0-d tensor.

    Args:
        image_scale: Stored on the transform; kept for interface parity
            with create_transform.
        similarity_threshold: Minimum cosine similarity to keep an
            element (default 0.25, matching the previous behavior).

    Returns:
        A pygrain.FilterTransform subclass; instantiate it in the pipeline.
    """
    import torch
    import torch.nn.functional as F

    class FilterTransform(pygrain.FilterTransform):
        """Filter transform for GCS data source."""

        def __init__(self, model=None, processor=None, method=cv2.INTER_AREA):
            super().__init__()
            self.image_scale = image_scale
            if model is None:
                # Lazy import: transformers is only needed when filtering
                # is actually enabled for a dataset.
                from transformers import AutoProcessor, CLIPModel
                model_name = "openai/clip-vit-base-patch32"
                model = CLIPModel.from_pretrained(model_name)
                processor = AutoProcessor.from_pretrained(model_name, use_fast=False)
            self.method = method
            self.model = model
            self.processor = processor

        def filter(self, data: Dict[str, Any]) -> bool:
            """Keep the element iff CLIP similarity >= similarity_threshold."""
            inputs = self.processor(
                text=[data['caption']],
                images=[data['image']],
                return_tensors="pt",
                padding=True,
                truncation=True,
            )
            with torch.no_grad():
                image_embeds = self.model.get_image_features(pixel_values=inputs['pixel_values'])
                text_embeds = self.model.get_text_features(input_ids=inputs['input_ids'])
                similarity = F.cosine_similarity(image_embeds, text_embeds)
            return bool(similarity[0] >= similarity_threshold)

    return FilterTransform
300350

301351

302352
# ----------------------------------------------------------------------------------
@@ -333,3 +383,8 @@ def gcs_augmenters(image_scale, method):
333383
"""Legacy function for GCS augmenters."""
334384
augmenter = ImageGCSAugmenter()
335385
return augmenter.create_transform(image_scale=image_scale, method=method)
386+
387+
def gcs_filters(image_scale):
    """Legacy function for GCS Filters.

    Thin wrapper used by the datasetMap "filter" entries; delegates to
    ImageGCSAugmenter.create_filter.
    """
    return ImageGCSAugmenter().create_filter(image_scale=image_scale)

flaxdiff/data/sources/videos.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -216,6 +216,11 @@ def random_map(self, element, rng: np.random.Generator) -> Dict[str, jnp.array]:
216216

217217
return AudioVideoTransform
218218

219+
220+
def create_filter(self, image_scale: int = 256):
    """Create a pass-through filter for video datasets.

    Fixes two defects in the original: pygrain filter transforms must
    implement ``filter`` (the original defined ``map``, which grain never
    calls on a FilterTransform), and the class must be returned so callers
    can instantiate it (the original implicitly returned None).

    Args:
        image_scale: Unused here; kept for interface parity with the
            image augmenters' create_filter methods.

    Returns:
        A pygrain.FilterTransform subclass that keeps every element.
    """
    class FilterTransform(pygrain.FilterTransform):
        def filter(self, element) -> bool:
            # No filtering implemented for video sources yet — keep everything.
            return True
    return FilterTransform
219224

220225
# ----------------------------------------------------------------------------------
221226
# Helper functions for video datasets

0 commit comments

Comments
 (0)