Skip to content

Commit 1ed8c72

Browse files
committed
Normalize the file path before writing for the cats_vs_dogs.py dataset
1 parent b68aa45 commit 1ed8c72

File tree

1 file changed

+82
-80
lines changed

1 file changed

+82
-80
lines changed

tensorflow_datasets/image_classification/cats_vs_dogs.py

Lines changed: 82 additions & 80 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
"""Cats vs Dogs dataset."""
1717

1818
import io
19+
import os
1920
import re
2021
import zipfile
2122

@@ -42,90 +43,91 @@
4243
)
4344
_NUM_CORRUPT_IMAGES = 1738
4445
_DESCRIPTION = (
45-
"A large set of images of cats and dogs. "
46-
"There are %d corrupted images that are dropped." % _NUM_CORRUPT_IMAGES
46+
"A large set of images of cats and dogs. "
47+
"There are %d corrupted images that are dropped." % _NUM_CORRUPT_IMAGES
4748
)
4849

4950
# Matches archive member paths like 'PetImages/Cat/123.jpg', accepting both
# '/' and '\' as path separators (the zip may use either depending on how it
# was built); group(1) captures the class directory name ('Cat' or 'Dog').
_NAME_RE = re.compile(r"^PetImages[\\/](Cat|Dog)[\\/]\d+\.jpg$")
5051

5152

5253
class CatsVsDogs(tfds.core.GeneratorBasedBuilder):
53-
"""Cats vs Dogs."""
54-
55-
VERSION = tfds.core.Version("4.0.1")
56-
RELEASE_NOTES = {
57-
"4.0.0": "New split API (https://tensorflow.org/datasets/splits)",
58-
"4.0.1": (
59-
"Recoding images in generator to fix corrupt JPEG data warnings"
60-
" (https://github.com/tensorflow/datasets/issues/2188)"
61-
),
62-
}
63-
64-
def _info(self):
65-
return tfds.core.DatasetInfo(
66-
builder=self,
67-
description=_DESCRIPTION,
68-
features=tfds.features.FeaturesDict({
69-
"image": tfds.features.Image(),
70-
"image/filename": tfds.features.Text(), # eg 'PetImages/Dog/0.jpg'
71-
"label": tfds.features.ClassLabel(names=["cat", "dog"]),
72-
}),
73-
supervised_keys=("image", "label"),
74-
homepage=(
75-
"https://www.microsoft.com/en-us/download/details.aspx?id=54765"
54+
"""Cats vs Dogs."""
55+
56+
VERSION = tfds.core.Version("4.0.1")
57+
RELEASE_NOTES = {
58+
"4.0.0": "New split API (https://tensorflow.org/datasets/splits)",
59+
"4.0.1": (
60+
"Recoding images in generator to fix corrupt JPEG data warnings"
61+
" (https://github.com/tensorflow/datasets/issues/2188)"
7662
),
77-
citation=_CITATION,
78-
)
79-
80-
def _split_generators(self, dl_manager):
81-
path = dl_manager.download(_URL)
82-
83-
# There is no predefined train/val/test split for this dataset.
84-
return [
85-
tfds.core.SplitGenerator(
86-
name=tfds.Split.TRAIN,
87-
gen_kwargs={
88-
"archive": dl_manager.iter_archive(path),
89-
},
90-
),
91-
]
92-
93-
def _generate_examples(self, archive):
94-
"""Generate Cats vs Dogs images and labels given a directory path."""
95-
num_skipped = 0
96-
for fname, fobj in archive:
97-
res = _NAME_RE.match(fname)
98-
if not res: # README file, ...
99-
continue
100-
label = res.group(1).lower()
101-
if tf.compat.as_bytes("JFIF") not in fobj.peek(10):
102-
num_skipped += 1
103-
continue
104-
105-
# Some images caused 'Corrupt JPEG data...' messages during training or
106-
# any other iteration recoding them once fixes the issue (discussion:
107-
# https://github.com/tensorflow/datasets/issues/2188).
108-
# Those messages are now displayed when generating the dataset instead.
109-
img_data = fobj.read()
110-
img_tensor = tf.image.decode_image(img_data)
111-
img_recoded = tf.io.encode_jpeg(img_tensor)
112-
113-
# Converting the recoded image back into a zip file container.
114-
buffer = io.BytesIO()
115-
with zipfile.ZipFile(buffer, "w") as new_zip:
116-
new_zip.writestr(fname, img_recoded.numpy())
117-
new_fobj = zipfile.ZipFile(buffer).open(fname)
118-
119-
record = {
120-
"image": new_fobj,
121-
"image/filename": fname,
122-
"label": label,
123-
}
124-
yield fname, record
125-
126-
if num_skipped != _NUM_CORRUPT_IMAGES:
127-
raise ValueError(
128-
"Expected %d corrupt images, but found %d"
129-
% (_NUM_CORRUPT_IMAGES, num_skipped)
130-
)
131-
logging.warning("%d images were corrupted and were skipped", num_skipped)
63+
}
64+
65+
def _info(self):
  """Returns the `tfds.core.DatasetInfo` describing this dataset."""
  features = tfds.features.FeaturesDict({
      "image": tfds.features.Image(),
      # Archive-relative path of the source file, e.g. 'PetImages/Dog/0.jpg'.
      "image/filename": tfds.features.Text(),
      "label": tfds.features.ClassLabel(names=["cat", "dog"]),
  })
  return tfds.core.DatasetInfo(
      builder=self,
      description=_DESCRIPTION,
      features=features,
      supervised_keys=("image", "label"),
      homepage=(
          "https://www.microsoft.com/en-us/download/details.aspx?id=54765"
      ),
      citation=_CITATION,
  )
80+
81+
def _split_generators(self, dl_manager):
  """Downloads the archive and declares the dataset's splits.

  The source data ships without a predefined train/val/test partition, so
  everything is exposed as a single TRAIN split.
  """
  archive_path = dl_manager.download(_URL)
  train_split = tfds.core.SplitGenerator(
      name=tfds.Split.TRAIN,
      gen_kwargs={"archive": dl_manager.iter_archive(archive_path)},
  )
  return [train_split]
93+
94+
def _generate_examples(self, archive):
  """Yields `(key, example)` pairs for each valid JPEG in the archive.

  Entries whose path does not look like a pet image are ignored; entries
  missing the JFIF header are counted as corrupt and skipped. At the end the
  corrupt count is validated against the known figure for this dataset.
  """
  corrupt_count = 0
  for raw_name, handle in archive:
    # Normalize the archive path before matching and before using it as the
    # example key / filename feature.
    path = os.path.normpath(raw_name)
    match = _NAME_RE.match(path)
    if match is None:  # Non-image entries such as a README.
      continue
    label = match.group(1).lower()

    # Corrupt files lack the JFIF marker near the start of the header; count
    # them so the total can be validated below, then drop them.
    if tf.compat.as_bytes("JFIF") not in handle.peek(10):
      corrupt_count += 1
      continue

    # Some images caused 'Corrupt JPEG data...' messages during training or
    # any other iteration; re-encoding them once fixes the issue (discussion:
    # https://github.com/tensorflow/datasets/issues/2188). Those messages are
    # now displayed when generating the dataset instead.
    recoded = tf.io.encode_jpeg(tf.image.decode_image(handle.read()))

    # Repack the recoded bytes into an in-memory zip member so downstream
    # code still receives a file-like object, as with the original archive.
    zip_buffer = io.BytesIO()
    with zipfile.ZipFile(zip_buffer, "w") as repacked:
      repacked.writestr(path, recoded.numpy())
    member = zipfile.ZipFile(zip_buffer).open(path)

    yield path, {
        "image": member,
        "image/filename": path,
        "label": label,
    }

  if corrupt_count != _NUM_CORRUPT_IMAGES:
    raise ValueError(
        "Expected %d corrupt images, but found %d"
        % (_NUM_CORRUPT_IMAGES, corrupt_count)
    )
  logging.warning("%d images were corrupted and were skipped", corrupt_count)

0 commit comments

Comments
 (0)