Hopefully a better revert

AshishKumar4 · AshishKumar4 · commit dfa5b4bc590d · 2025-05-05T05:39:44.000Z
diff --git a/flaxdiff/data/dataloaders.py b/flaxdiff/data/dataloaders.py
@@ -292,22 +292,22 @@ def get_dataset_grain(
         Dictionary with train dataset function and metadata.
     """
     dataset = datasetMap[data_name]
-    train_source = dataset["source"](dataset_source, split="train")
-    # val_source = dataset["source"](dataset_source, split="val")
+    data_source = dataset["source"](dataset_source)
     augmenter = dataset["augmenter"](image_scale, method)
+    filters = dataset.get("filter", None)(image_scale)
 
     local_batch_size = batch_size // jax.process_count()
 
     train_sampler = pygrain.IndexSampler(
-        num_records=len(train_source) if count is None else count,
+        num_records=len(data_source) if count is None else count,
         shuffle=True,
         seed=seed,
         num_epochs=num_epochs,
         shard_options=pygrain.ShardByJaxProcess(),
     )
 
     # val_sampler = pygrain.IndexSampler(
-    #     num_records=len(val_source) if count is None else count,
+    #     num_records=len(data_source) if count is None else count,
     #     shuffle=False,
     #     seed=seed,
     #     num_epochs=num_epochs,
@@ -327,7 +327,7 @@ def get_trainset():
         transformations.append(pygrain.Batch(local_batch_size, drop_remainder=True))
 
         loader = pygrain.DataLoader(
-            data_source=train_source,
+            data_source=data_source,
             sampler=train_sampler,
             operations=transformations,
             worker_count=worker_count,
@@ -345,22 +345,23 @@ def get_trainset():
     #     ]
 
     #     loader = pygrain.DataLoader(
-    #         data_source=train_source,
-    #         sampler=train_sampler,
+    #         data_source=data_source,
+    #         sampler=val_sampler,
     #         operations=transformations,
-    #         worker_count=2,
+    #         worker_count=worker_count,
     #         read_options=pygrain.ReadOptions(
     #             read_thread_count, read_buffer_size
     #         ),
-    #         worker_buffer_size=2,
+    #         worker_buffer_size=worker_buffer_size,
     #     )
     #     return loader
+    get_valset = get_trainset  # For now, use the same function for validation
 
     return {
         "train": get_trainset,
-        "train_len": len(train_source),
-        "val": get_trainset,
-        "val_len": len(train_source),
+        "train_len": len(data_source),
+        "val": get_valset,
+        "val_len": len(data_source),
         "local_batch_size": local_batch_size,
         "global_batch_size": batch_size,
     }
diff --git a/flaxdiff/data/dataset_map.py b/flaxdiff/data/dataset_map.py
@@ -21,8 +21,9 @@
         "augmenter": gcs_augmenters,
     },
     "laiona_coco": {
-        "source": data_source_gcs('datasets/laion12m+mscoco_filtered-new'),
+        "source": data_source_gcs('datasets/laion12m+mscoco'),
         "augmenter": gcs_augmenters,
+        "filter": gcs_filters,
     },
     "aesthetic_coyo": {
         "source": data_source_gcs('arrayrecords/aestheticCoyo_0.25clip_6aesthetic'),
diff --git a/flaxdiff/data/sources/images.py b/flaxdiff/data/sources/images.py
@@ -82,7 +82,7 @@ def load_labels(sample):
 class ImageTFDSSource(DataSource):
     """Data source for TensorFlow Datasets (TFDS) image datasets."""
     
-    def __init__(self, name: str, use_tf: bool = True):
+    def __init__(self, name: str, use_tf: bool = True, split: str = "all"):
         """Initialize a TFDS image data source.
         
         Args:
@@ -92,8 +92,9 @@ def __init__(self, name: str, use_tf: bool = True):
         """
         self.name = name
         self.use_tf = use_tf
+        self.split = split
     
-    def get_source(self, path_override: str, split: str = "all") -> Any:
+    def get_source(self, path_override: str) -> Any:
         """Get the TFDS data source.
         
         Args:
@@ -104,9 +105,9 @@ def get_source(self, path_override: str, split: str = "all") -> Any:
         """
         import tensorflow_datasets as tfds
         if self.use_tf:
-            return tfds.load(self.name, split=split, shuffle_files=True)
+            return tfds.load(self.name, split=self.split, shuffle_files=True)
         else:
-            return tfds.data_source(self.name, split=split, try_gcs=False)
+            return tfds.data_source(self.name, split=self.split, try_gcs=False)
 
 
 class ImageTFDSAugmenter(DataAugmenter):
@@ -198,7 +199,7 @@ def __init__(self, source: str = 'arrayrecord/laion-aesthetics-12m+mscoco-2017')
         """
         self.source = source
     
-    def get_source(self, path_override: str = "/home/mrwhite0racle/gcs_mount", split: str = "train") -> Any:
+    def get_source(self, path_override: str = "/home/mrwhite0racle/gcs_mount") -> Any:
         """Get the GCS data source.
         
         Args:
@@ -210,8 +211,6 @@ def get_source(self, path_override: str = "/home/mrwhite0racle/gcs_mount", split
         records_path = os.path.join(path_override, self.source)
         records = [os.path.join(records_path, i) for i in os.listdir(
             records_path) if 'array_record' in i]
-        if split == "val":
-            records = records[:1]
         return pygrain.ArrayRecordDataSource(records)
 
 
@@ -226,7 +225,7 @@ def __init__(self, sources: List[str] = []):
         """
         self.sources = sources
     
-    def get_source(self, path_override: str = "/home/mrwhite0racle/gcs_mount", split: str = "train") -> Any:
+    def get_source(self, path_override: str = "/home/mrwhite0racle/gcs_mount") -> Any:
         """Get the combined GCS data source.
         
         Args:
@@ -240,8 +239,6 @@ def get_source(self, path_override: str = "/home/mrwhite0racle/gcs_mount", split
         for records_path in records_paths:
             records += [os.path.join(records_path, i) for i in os.listdir(
                 records_path) if 'array_record' in i]
-        if split == "val":
-            records = records[:1]
         return pygrain.ArrayRecordDataSource(records)
 
 class ImageGCSAugmenter(DataAugmenter):
@@ -357,9 +354,9 @@ def filter(self, data: Dict[str, Any]) -> bool:
 
 # These functions maintain backward compatibility with existing code
 
-def data_source_tfds(name, use_tf=True):
+def data_source_tfds(name, use_tf=True, split="all"):
     """Legacy function for TFDS data sources."""
-    source = ImageTFDSSource(name=name, use_tf=use_tf)
+    source = ImageTFDSSource(name=name, use_tf=use_tf, split=split)
     return source.get_source
 
 
@@ -389,4 +386,4 @@ def gcs_augmenters(image_scale, method):
 def gcs_filters(image_scale):
     """Legacy function for GCS Filters."""
     augmenter = ImageGCSAugmenter()
-    return augmenter.create_filter(image_scale=image_scale)
+    return augmenter.create_filter(image_scale=image_scale)